In [11]:
import pandas as pd

In [12]:
# Load the raw training split into memory.
df = pd.read_csv("/teamspace/studios/this_studio/Assignment-TechstaX/data/train_data.csv")

# Preview a handful of random rows; the fixed seed keeps the preview identical
# across "Restart Kernel -> Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Description,City,County,State,Timezone,Temperature(F),...,Stop,Traffic_Calming,Traffic_Signal,Weather_Group,Wind_Direction_Group,Year,Month,Day,Time_Of_Day,Duration_Minutes
314040,2,39.203579,-76.692322,0.0,Right hand shoulder blocked due to accident on...,Linthicum Heights,Anne Arundel,MD,US/Eastern,75.0,...,False,False,False,Clear,Calm,2021,6,29,Night,91.53
1824968,2,38.885941,-77.447319,0.897,Stationary traffic on VA-28 S from US-50 (VA-2...,Chantilly,Fairfax,VA,US/Eastern,53.0,...,False,False,True,Cloudy,East,2022,4,4,Noon,166.0
4767026,2,39.89122,-121.57766,0.0,Accident on Skyway at Lovelock Rd.,Magalia,Butte,CA,US/Pacific,55.4,...,False,False,False,Cloudy,South,2018,1,4,Morning,29.85
443694,2,40.725201,-89.523361,0.0,Lane blocked due to accident on IL-116 Main St...,Washington,Tazewell,IL,US/Central,25.0,...,False,False,False,Cloudy,West,2021,1,11,Night,26.38
3586368,2,27.99806,-80.581787,0.01,Accident on FL-514 Malabar Rd at Glatter Rd.,Malabar,Brevard,FL,US/Eastern,73.9,...,False,False,False,Clear,East,2016,11,3,Morning,29.08


In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer


# Impute missing values in two passes:
#   1. numerical weather measurements -> IterativeImputer (model-based)
#   2. categorical location columns   -> SimpleImputer (most frequent value)

# Convert literal "None" strings to a real missing marker for the categorical
# columns so they are counted and imputed like any other missing entry.
# This only needs to happen once: nothing below re-introduces "None" strings.
categorical_cols_with_missing = ['City', 'Timezone']
for col in categorical_cols_with_missing:
    df[col] = df[col].replace('None', pd.NA)

# Select numerical columns with missing values
numerical_cols_with_missing = [
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)'
]

# Extract the subset of the dataframe with these columns
numerical_df = df[numerical_cols_with_missing]

# Print the number of missing values before numerical imputation
print("Missing values before numerical imputation:")
print(numerical_df.isna().sum())

# Initialize the IterativeImputer; random_state pins the result for re-runs.
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit and transform the numerical data.
# fit_transform returns a bare array, so the original index is re-attached
# explicitly: the assignment back into df aligns on index labels, and a fresh
# 0..n-1 index would silently misplace rows (or produce NaN) whenever df does
# not carry a default RangeIndex.
imputed_numerical_df = pd.DataFrame(
    imputer.fit_transform(numerical_df),
    columns=numerical_df.columns,
    index=numerical_df.index,
)

# Replace the original numerical columns with the imputed values
df[numerical_cols_with_missing] = imputed_numerical_df

# Print the number of missing values after numerical imputation
print("\nMissing values after numerical imputation:")
print(df[numerical_cols_with_missing].isna().sum())

# Handle categorical columns with mode imputation using SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Print the number of missing values before categorical imputation
print("\nMissing values before categorical imputation:")
print(df[categorical_cols_with_missing].isna().sum())

# Fit and transform the categorical data, again keeping df's own index so the
# write-back below lines up row for row.
imputed_categorical_df = pd.DataFrame(
    categorical_imputer.fit_transform(df[categorical_cols_with_missing]),
    columns=categorical_cols_with_missing,
    index=df.index,
)

# Replace the original categorical columns with the imputed values
df[categorical_cols_with_missing] = imputed_categorical_df

# Print the number of missing values after categorical imputation
print("\nMissing values after categorical imputation:")
print(df[categorical_cols_with_missing].isna().sum())

# Sanity check: both of the following frames are expected to be empty.
# Print the rows with missing values in numerical columns
print("\nRows with missing values in numerical columns:")
print(df[df[numerical_cols_with_missing].isnull().any(axis=1)])

# Print the rows with missing values in categorical columns
print("\nRows with missing values in categorical columns:")
print(df[df[categorical_cols_with_missing].isnull().any(axis=1)])

# Display the first few rows of the imputed dataframe
print("\nFirst few rows of the imputed dataframe:")
df.head()

Missing values before numerical imputation:
Temperature(F)        163853
Humidity(%)           174144
Pressure(in)          140679
Visibility(mi)        177098
Wind_Speed(mph)       571233
Precipitation(in)    2203586
dtype: int64

Missing values after numerical imputation:
Temperature(F)       0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Speed(mph)      0
Precipitation(in)    0
dtype: int64

Missing values before categorical imputation:
City         253
Timezone    7808
dtype: int64

Missing values after categorical imputation:
City        0
Timezone    0
dtype: int64

Rows with missing values in numerical columns:
Empty DataFrame
Columns: [Severity, Start_Lat, Start_Lng, Distance(mi), Description, City, County, State, Timezone, Temperature(F), Humidity(%), Pressure(in), Visibility(mi), Wind_Speed(mph), Precipitation(in), Amenity, Bump, Crossing, Give_Way, Junction, No_Exit, Railway, Roundabout, Station, Stop, Traffic_Calming, Traffic_Signal, Weather_Gro

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Description,City,County,State,Timezone,Temperature(F),...,Stop,Traffic_Calming,Traffic_Signal,Weather_Group,Wind_Direction_Group,Year,Month,Day,Time_Of_Day,Duration_Minutes
0,3,40.10059,-82.925194,0.01,Right hand shoulder blocked due to accident on...,Westerville,Franklin,OH,US/Eastern,37.4,...,False,False,False,Rain,South,2016,2,8,Morning,30.0
1,3,39.932709,-82.83091,0.01,One lane blocked due to accident on I-70 Westb...,Reynoldsburg,Franklin,OH,US/Eastern,37.4,...,False,False,False,Rain,South,2016,2,8,Morning,30.0
2,3,40.023487,-82.994888,0.01,Accident on I-71 Northbound at Exit 113 Silver...,Columbus,Franklin,OH,US/Eastern,37.0,...,False,False,False,Cloudy,West,2016,2,8,Noon,45.0
3,2,40.158024,-82.641762,1.32,Accident on County Hwy-16 Sportsman Club Rd be...,Johnstown,Licking,OH,US/Eastern,37.9,...,False,False,False,Snow,West,2016,2,8,Noon,30.0
4,2,39.775303,-84.200523,0.0,Accident on Helena St at OH-48 Main St.,Dayton,Montgomery,OH,US/Eastern,39.9,...,False,False,True,Cloudy,West,2016,2,8,Noon,45.0


In [15]:
# Fill missing free-text descriptions with a fixed placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Description']: that chained in-place form acts on
# an intermediate Series, emits a FutureWarning on recent pandas, and leaves df
# unchanged once Copy-on-Write is enabled.
df['Description'] = df['Description'].fillna("Description is not available")

In [16]:
# Tally the missing entries per column across the whole frame and show them.
missing_per_column = df.isna().sum()
print(missing_per_column)

Severity                0
Start_Lat               0
Start_Lng               0
Distance(mi)            0
Description             0
City                    0
County                  0
State                   0
Timezone                0
Temperature(F)          0
Humidity(%)             0
Pressure(in)            0
Visibility(mi)          0
Wind_Speed(mph)         0
Precipitation(in)       0
Amenity                 0
Bump                    0
Crossing                0
Give_Way                0
Junction                0
No_Exit                 0
Railway                 0
Roundabout              0
Station                 0
Stop                    0
Traffic_Calming         0
Traffic_Signal          0
Weather_Group           0
Wind_Direction_Group    0
Year                    0
Month                   0
Day                     0
Time_Of_Day             0
Duration_Minutes        0
dtype: int64


In [17]:
# Persist the imputed dataframe as CSV (row index omitted) so it can be
# reloaded from disk later.
imputed_csv_path = '/teamspace/studios/this_studio/Assignment-TechstaX/data/imputed_dataset.csv'
df.to_csv(imputed_csv_path, index=False)

In [18]:
# Display the column labels of the dataframe as the cell's output.
df.columns

Index(['Severity', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description',
       'City', 'County', 'State', 'Timezone', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Weather_Group',
       'Wind_Direction_Group', 'Year', 'Month', 'Day', 'Time_Of_Day',
       'Duration_Minutes'],
      dtype='object')

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = '/teamspace/studios/this_studio/Assignment-TechstaX/data/imputed_dataset.csv'  # Use the imputed dataset
df = pd.read_csv(file_path)

# Set the style for seaborn
sns.set(style="whitegrid")

# Define the folder to save the plots
output_folder = '/teamspace/studios/this_studio/Assignment-TechstaX/plots'
os.makedirs(output_folder, exist_ok=True)

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Upper bound on the number of categories drawn in a single bar plot.
# Free-text or high-cardinality columns would otherwise yield one bar per
# distinct value, which is extremely slow to render and unreadable.
MAX_CATEGORIES = 50

# Visualize and save numerical columns: histogram (with KDE) next to a box plot.
# Iterating an empty column index is a no-op, so no length guard is needed.
for col in numerical_cols:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

    # Histogram
    sns.histplot(df[col], kde=True, ax=axes[0])
    axes[0].set_title(f'Histogram of {col}')
    axes[0].set_xlabel(col)
    axes[0].set_ylabel('Frequency')

    # Box Plot
    sns.boxplot(y=df[col], ax=axes[1])
    axes[1].set_title(f'Box Plot of {col}')
    axes[1].set_ylabel(col)

    # Lay out and save through the figure object rather than pyplot's
    # implicit "current figure" state.
    fig.tight_layout()
    plot_filename = os.path.join(output_folder, f'{col}_distribution.png')
    fig.savefig(plot_filename, bbox_inches='tight')

    # Close the figure to free up memory
    plt.close(fig)

# Visualize and save categorical columns as horizontal count plots, ordered by
# descending frequency and capped at MAX_CATEGORIES bars.
for col in categorical_cols:
    category_counts = df[col].value_counts()
    top_categories = category_counts.index[:MAX_CATEGORIES]
    is_truncated = len(category_counts) > MAX_CATEGORIES

    # Only pay for the row filter when the column actually exceeds the cap.
    if is_truncated:
        plot_values = df.loc[df[col].isin(top_categories), col]
    else:
        plot_values = df[col]

    fig, ax = plt.subplots(figsize=(14, 6))

    # Bar Plot
    sns.countplot(y=plot_values, ax=ax, order=top_categories)
    title = f'Bar Plot of {col}'
    if is_truncated:
        # Make the truncation visible so the figure is not misread as complete.
        title += f' (top {MAX_CATEGORIES} of {len(category_counts)} categories)'
    ax.set_title(title)
    ax.set_xlabel('Count')
    ax.set_ylabel(col)

    fig.tight_layout()
    plot_filename = os.path.join(output_folder, f'{col}_barplot.png')
    fig.savefig(plot_filename, bbox_inches='tight')

    # Close the figure to free up memory
    plt.close(fig)

print(f"All plots have been saved to the folder: {output_folder}")