In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime

In [None]:
# Load every raw table used in this notebook. Paths are relative to the
# notebook's working directory; files are read in the order listed.
drivers_df, races_df, lap_times_df = (
    pd.read_csv('Data/drivers.csv'),
    pd.read_csv('Data/races.csv'),
    pd.read_csv('Data/lap_times.csv'),
)
pit_stops_df, results_df, constructors_df = (
    pd.read_csv('Data/pit_stops.csv'),
    pd.read_csv('Data/results.csv'),
    pd.read_csv('Data/constructors.csv'),
)

In [None]:
# Convert the date-like columns to pandas datetimes so the `.dt` accessor
# can be used on them in later cells.
races_df['date'] = races_df['date'].pipe(pd.to_datetime)
drivers_df['dob'] = drivers_df['dob'].pipe(pd.to_datetime)

In [None]:
# Numeric lap-time columns: replace missing values with that column's median.
# NOTE(review): this selects *every* numeric column, which presumably includes
# identifier columns such as raceId/driverId — confirm that imputing those
# with a median is intended.
numerical_columns = lap_times_df.select_dtypes(include=[np.number]).columns
lap_medians = lap_times_df[numerical_columns].median()
lap_times_df[numerical_columns] = lap_times_df[numerical_columns].fillna(lap_medians)

# Object-dtype driver columns: replace missing values with that column's mode
# (row 0 of DataFrame.mode(), i.e. the first reported mode for each column).
categorical_columns = drivers_df.select_dtypes(include=['object']).columns
driver_modes = drivers_df[categorical_columns].mode().iloc[0]
drivers_df[categorical_columns] = drivers_df[categorical_columns].fillna(driver_modes)

In [None]:
# Derive each race's calendar year from its date (overwrites any existing
# 'year' column on races_df).
race_dates = pd.to_datetime(races_df['date'])
races_df['year'] = race_dates.dt.year

# Mean of the 'milliseconds' column for every (raceId, driverId) pair,
# flattened back into ordinary columns.
avg_lap_times = (
    lap_times_df
    .groupby(['raceId', 'driverId'])['milliseconds']
    .mean()
    .reset_index()
)

In [None]:
# Box plot of the 'milliseconds' column over all rows of lap_times_df
# (a single overall distribution, not broken down by time period).
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(y='milliseconds', data=lap_times_df, ax=ax)
ax.set_title('Distribution of Lap Times')
ax.set_ylabel('Time (milliseconds)')

In [None]:
 # Lap time trends over years
# Attach the race year to every lap row, then average 'milliseconds' per year.
merged_data = lap_times_df.merge(races_df[['raceId', 'year']], on='raceId')
yearly_avg = (
    merged_data
    .groupby('year')['milliseconds']
    .mean()
    .reset_index()
)

# Line chart of the yearly mean; dividing by 1000 turns ms into seconds.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(yearly_avg['year'], yearly_avg['milliseconds'] / 1000, marker='o')
ax.set_title('Average Lap Times by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Lap Time (seconds)')
ax.grid(True)

In [None]:
# Per-driver mean and standard deviation of the 'position' column across all
# of that driver's rows in lap_times_df, plotted against each other.
position_changes = (
    lap_times_df
    .groupby('driverId')['position']
    .agg(['mean', 'std'])
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(position_changes['mean'], position_changes['std'])
ax.set_title('Position Consistency Analysis')
ax.set_xlabel('Average Position')
ax.set_ylabel('Position Variation (std)')

In [None]:
 # Driver nationality distribution
# Bar chart of the ten most frequent values in drivers_df['nationality'].
nationality_counts = drivers_df['nationality'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
nationality_counts.head(10).plot(kind='bar', ax=ax)
ax.set_title('Top 10 Driver Nationalities')
ax.set_xlabel('Nationality')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the category labels so they do not overlap
fig.tight_layout()

In [None]:
# Number of races held in each year.
# Uses the races table already loaded as `races_df` instead of reading
# Data/races.csv a second time into a throwaway `df`; races_df carries the
# 'year' column that the grouping needs.
races_by_year = races_df.groupby('year').size()

fig, ax = plt.subplots(figsize=(12, 6))
races_by_year.plot(kind='line', marker='o', ax=ax)
ax.set_title('Number of Races per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Races')
ax.grid(True)

In [None]:
# Top 10 constructors by total points summed over all rows of results_df.
#
# The already-loaded `results_df` and `constructors_df` are joined here rather
# than re-reading both CSV files, and the joined frame gets its own name so
# that `results_df` keeps holding the raw results table for any other cell.
# Only the columns needed for the aggregation take part in the merge.
constructor_results = results_df[['constructorId', 'points']].merge(
    constructors_df[['constructorId', 'name']],
    on='constructorId',
)
constructor_points = (
    constructor_results
    .groupby('name')['points']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(12, 6))
constructor_points.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Constructors by Total Points')
ax.set_xlabel('Constructor')
ax.set_ylabel('Total Points')
plt.xticks(rotation=45)  # tilt the constructor names so they do not overlap
fig.tight_layout()

In [None]:
# Pairwise correlation (DataFrame.corr with its default settings) between the
# 'milliseconds' and 'position' columns, drawn as an annotated heatmap.
correlation_data = lap_times_df.loc[:, ['milliseconds', 'position']].copy()
corr_matrix = correlation_data.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between Lap Times and Position')

In [None]:
# Step 1: attach driver attributes to every lap row (default inner join on driverId).
laps_with_drivers = lap_times_df.merge(drivers_df, on='driverId')
# Step 2: attach the race year and race name (default inner join on raceId).
merged_lap_data = laps_with_drivers.merge(
    races_df[['raceId', 'year', 'name']],
    on='raceId',
)

# Per-year summary: mean/std/count of 'milliseconds' and mean 'position'.
# The list-valued aggregation yields two-level column labels.
yearly_stats = merged_lap_data.groupby('year').agg(
    {'milliseconds': ['mean', 'std', 'count'], 'position': 'mean'}
).reset_index()

yearly_stats

In [None]:
# Headline figures for the loaded tables. Values taken from the
# 'milliseconds' column are divided by 1000 to express them in seconds.
lap_ms = lap_times_df['milliseconds']
race_years = races_df['year']

report = {
    'total_races': len(races_df),
    'total_drivers': len(drivers_df),
    'avg_lap_time': lap_ms.mean() / 1000,
    'fastest_lap': lap_ms.min() / 1000,
    'total_laps_recorded': len(lap_times_df),
    'years_covered': f"{race_years.min()} - {race_years.max()}",
}
report