In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
chunk_size = 100000

# Stream the accidents CSV `chunk_size` rows at a time so the whole file never has to
# sit in memory, keeping from each piece only the rows whose State is 'GA'.
accident_chunks = pd.read_csv("/content/drive/MyDrive/6242Project/Data/US_Accidents_March23.csv", chunksize=chunk_size)
filtered_chunks = [piece.loc[piece['State'] == 'GA'] for piece in accident_chunks]

# Stitch the filtered pieces together under a fresh 0..n-1 index.
df_ga = pd.concat(filtered_chunks, ignore_index=True)

In [None]:
# Number of rows kept after filtering to State == 'GA'
df_ga.shape[0]

169234

In [None]:
# target is severity
y = df_ga['Severity']

# Features are every column from position 11 onward, minus the four columns listed below.
# `drop` returns a new DataFrame, so X is an independent object instead of a slice of
# df_ga that gets mutated in place (the in-place form raises SettingWithCopyWarning and
# leaves it unclear whether later writes to X touch a view or a copy).
X = df_ga.iloc[:, 11:].drop(columns=['State', 'Country', 'Timezone', 'Airport_Code'])

In [None]:
# Re-attach the target as the last column: keeping it at the end is convenient
# while cleaning the data and deriving new columns.
X = X.assign(Severity=y)

In [None]:
# First step towards the day of week: parse the timestamp column into datetimes
# so that the .dt accessor is available on it.
weather_timestamps = pd.to_datetime(X['Weather_Timestamp'])
X['Weather_Timestamp'] = weather_timestamps

In [None]:
# Day name (e.g. "Monday") of each parsed Weather_Timestamp value
weather_observed_at = X['Weather_Timestamp']
X['day_of_week'] = weather_observed_at.dt.day_name()

In [None]:
def is_weekday(day):
    """Return True if ``day`` is exactly one of "Monday" through "Friday".

    The comparison is case-sensitive; any other value (weekend names,
    differently-cased names, missing values) yields False.
    """
    return day in ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

# Build the boolean Weekday column by running is_weekday over every day_of_week value
X['Weekday'] = X['day_of_week'].map(is_weekday)

In [None]:
# The timestamp and the day name were only needed to derive Weekday; remove them
X = X.drop(columns=['Weather_Timestamp', 'day_of_week'])

### Missing Data Handling

In [None]:
# Count the missing values in each column
X.isnull().sum()

Street                     415
City                         0
County                       0
Zipcode                      0
Temperature(F)            3158
Wind_Chill(F)            61020
Humidity(%)               3333
Pressure(in)              2567
Visibility(mi)            2967
Wind_Direction            3042
Wind_Speed(mph)          16133
Precipitation(in)        63998
Weather_Condition         2705
Amenity                      0
Bump                         0
Crossing                     0
Give_Way                     0
Junction                     0
No_Exit                      0
Railway                      0
Roundabout                   0
Station                      0
Stop                         0
Traffic_Calming              0
Traffic_Signal               0
Turning_Loop                 0
Sunrise_Sunset             159
Civil_Twilight             159
Nautical_Twilight          159
Astronomical_Twilight      159
Severity                     0
Weekday                      0
dtype: int64

In [None]:
# A lot of wind chill readings are missing, so estimate them with the NWS formula
#   35.74 + 0.6215*T - 35.75*V**0.16 + 0.4275*T*V**0.16
# where T is the temperature in F and V is the wind speed in mph.
temperature_f = X['Temperature(F)']
wind_speed_mph = X['Wind_Speed(mph)']
wind_factor = wind_speed_mph ** 0.16
wind_chill_values = 35.74 + 0.6215 * temperature_f - 35.75 * wind_factor + 0.4275 * temperature_f * wind_factor

# The formula is only defined for T <= 50 F and V > 3 mph. Outside that range it produces
# numbers that are not a wind chill (for warm temperatures it even exceeds the air
# temperature), so fall back to the air temperature there. Rows where either input is
# missing keep a NaN estimate, so they are still treated as missing afterwards.
formula_applies = (temperature_f <= 50) & (wind_speed_mph > 3)
inputs_missing = temperature_f.isna() | wind_speed_mph.isna()
wind_chill_values = wind_chill_values.where(formula_applies | inputs_missing, temperature_f)

# Fill missing wind chill values with the estimates. The result is assigned back to the
# column: calling fillna(..., inplace=True) on X['Wind_Chill(F)'] operates on a temporary
# selection and does not update X when pandas Copy-on-Write is active.
X['Wind_Chill(F)'] = X['Wind_Chill(F)'].fillna(wind_chill_values)

In [None]:
# Fill missing precipitation and wind speed with 0, assuming a missing reading most
# likely means there was no precipitation or wind.
# The fill is done on the DataFrame and reassigned: X[col].fillna(0, inplace=True) acts on
# a temporary column selection and leaves X unchanged when pandas Copy-on-Write is active.
X = X.fillna({'Precipitation(in)': 0, 'Wind_Speed(mph)': 0})

# Also drop wind direction, it is not used as a feature
X = X.drop(columns=['Wind_Direction'])

In [None]:
# Discard every row that still has a missing value; enough rows should remain
# for the rest of the analysis without them.
X = X.dropna()

In [None]:
# Re-check the per-column missing-value counts after the clean-up
X.isnull().sum()

Street                   0
City                     0
County                   0
Zipcode                  0
Temperature(F)           0
Wind_Chill(F)            0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Direction           0
Wind_Speed(mph)          0
Precipitation(in)        0
Weather_Condition        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
Sunrise_Sunset           0
Civil_Twilight           0
Nautical_Twilight        0
Astronomical_Twilight    0
Severity                 0
Weekday                  0
dtype: int64

In [None]:
# Number of rows left after removing incomplete rows
X.shape[0]

151180

### Combining Weather condition categories into fewer categories

In [None]:
# Ordered (keyword, category) rules. They are applied top to bottom and each rule rewrites
# the whole cell, so an entry is claimed by the first keyword it contains
# (e.g. a description mentioning both rain and thunder ends up as 'rain').
weather_rules = [
    ('rain', 'rain'),
    ('shower', 'rain'),
    ('drizzle', 'rain'),
    ('mist', 'rain'),
    ('precip', 'rain'),
    ('snow', 'winter weather'),
    ('wintry', 'winter weather'),
    ('ice', 'winter weather'),
    ('cloud', 'cloudy'),
    ('overcast', 'cloudy'),
    ('storm', 'storm'),
    ('fair', 'clear'),
    ('fog', 'fog'),
    ('smoke', 'fog'),
    ('haze', 'fog'),
    ('wind', 'windy'),
    ('squall', 'windy'),
    ('thunder', 'storm'),
]

# Lower-case first so the keyword matching is case-insensitive
X['Weather_Condition'] = X['Weather_Condition'].str.lower()

for keyword, category in weather_rules:
    matches = X['Weather_Condition'].str.contains(keyword)
    X.loc[matches, 'Weather_Condition'] = category

In [None]:
# Remove the categorical columns with too many distinct values, plus the extra twilight columns.
# Ideas for later: replace City with the city's population, and turn the specific street
# into a street type (interstate, one-way road, highway, etc.).
columns_to_remove = ['Street', 'City', 'County', 'Zipcode', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
X = X.drop(columns=columns_to_remove)

### Work on some basic Modeling

In [None]:
# Display the cleaned feature table (the notebook renders a truncated preview of rows and columns)
X

Unnamed: 0,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,...,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Severity,Weekday
0,63.0,62.318240,97.0,29.75,3.0,9.2,0.05,rain,False,False,...,False,False,False,False,False,False,False,Day,3,True
1,63.0,63.213126,90.0,29.73,3.0,5.8,0.04,rain,False,False,...,False,False,False,False,False,False,False,Day,3,True
2,63.0,62.571880,90.0,29.73,2.5,8.1,0.62,rain,False,False,...,False,False,False,False,False,False,False,Day,3,True
3,63.0,62.069103,97.0,29.77,9.0,10.4,0.01,cloudy,False,False,...,False,False,False,False,False,False,False,Day,2,True
4,63.0,62.571880,97.0,29.70,10.0,8.1,0.13,rain,False,False,...,False,False,False,False,False,False,False,Day,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169229,80.0,80.000000,74.0,28.94,10.0,5.0,0.00,clear,False,False,...,False,False,False,False,False,False,False,Night,2,True
169230,85.0,85.000000,63.0,28.92,10.0,5.0,0.00,cloudy,False,False,...,False,False,False,False,False,True,False,Day,2,True
169231,85.0,85.000000,63.0,28.92,10.0,5.0,0.00,cloudy,False,False,...,False,False,False,False,False,True,False,Day,2,True
169232,80.0,80.000000,74.0,28.94,10.0,5.0,0.00,clear,False,False,...,False,False,False,False,False,False,False,Night,2,True


In [None]:
# Deal with any encoding for sklearn's LogisticRegression

# The indicator columns hold True/False values; store them as 0/1 integers
boolean_columns = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
                    'Traffic_Signal', 'Turning_Loop', 'Weekday']
X[boolean_columns] = X[boolean_columns].astype(int)

# One-hot encode the weather category. dtype=int pins the dummy columns to 0/1 integers;
# without it pandas >= 2.0 returns boolean columns, which would not match the other
# indicator columns.
one_hot_encoded = pd.get_dummies(X['Weather_Condition'], prefix='weather', dtype=int)

# Replace the original 'Weather_Condition' column with its one-hot columns
# (they are appended after the existing columns)
X = pd.concat([X.drop(columns=['Weather_Condition']), one_hot_encoded], axis=1)


#Sunrise_Sunset 1 for day, 0 for night
X['Sunrise_Sunset'] = X['Sunrise_Sunset'].map({'Day': 1, 'Night': 0})

X

Unnamed: 0,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,...,Sunrise_Sunset,Severity,Weekday,weather_clear,weather_cloudy,weather_fog,weather_rain,weather_storm,weather_windy,weather_winter weather
0,63.0,62.318240,97.0,29.75,3.0,9.2,0.05,0,0,0,...,1,3,1,0,0,0,1,0,0,0
1,63.0,63.213126,90.0,29.73,3.0,5.8,0.04,0,0,0,...,1,3,1,0,0,0,1,0,0,0
2,63.0,62.571880,90.0,29.73,2.5,8.1,0.62,0,0,0,...,1,3,1,0,0,0,1,0,0,0
3,63.0,62.069103,97.0,29.77,9.0,10.4,0.01,0,0,0,...,1,2,1,0,1,0,0,0,0,0
4,63.0,62.571880,97.0,29.70,10.0,8.1,0.13,0,0,0,...,1,3,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169229,80.0,80.000000,74.0,28.94,10.0,5.0,0.00,0,0,0,...,0,2,1,1,0,0,0,0,0,0
169230,85.0,85.000000,63.0,28.92,10.0,5.0,0.00,0,0,0,...,1,2,1,0,1,0,0,0,0,0
169231,85.0,85.000000,63.0,28.92,10.0,5.0,0.00,0,0,0,...,1,2,1,0,1,0,0,0,0,0
169232,80.0,80.000000,74.0,28.94,10.0,5.0,0.00,0,0,0,...,0,2,1,1,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target from the features
y = X['Severity']
X = X.drop(columns=['Severity'])

# Only the continuous measurements are standardized. Every other column is a 0/1
# indicator and is left untouched, so there is no need to standardize it and then
# turn it back into 0/1 through a positional column slice.
continuous_columns = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
                      'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']

# Split data 70/30: 70% of the rows for training, 30% held out for testing
# (test_size is the held-out fraction, so it has to be 0.3, not 0.7).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only, so that the means and standard deviations
# used for scaling carry no information from the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw[continuous_columns])


def scale_continuous(features):
    """Return a float copy of ``features`` with the continuous columns standardized.

    The already-fitted ``scaler`` is applied to ``continuous_columns``; all other
    columns are only converted to float. The input frame is not modified and the
    result keeps its index.
    """
    scaled = features.astype(float)
    scaled[continuous_columns] = scaler.transform(features[continuous_columns])
    return scaled


X_train = scale_continuous(X_train_raw)
X_test = scale_continuous(X_test_raw)

# Full feature table scaled with the same training-fitted scaler, kept under its
# original name for any later cell that reads it (rows keep X's index).
X_scaled = scale_continuous(X)


In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression with an L1 penalty, fitted with the 'saga' solver
# using all available cores. 'saga' is a stochastic solver, so random_state is fixed
# to make the fitted coefficients (and the reported accuracy) reproducible between runs.
logistic_reg = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=10000, n_jobs=-1, penalty='l1',
                                  random_state=42)

# Fit the model to the training data
logistic_reg.fit(X_train, y_train)


In [None]:
# Predict the labels for the test data
y_pred = logistic_reg.predict(X_test)

# Evaluate the model: accuracy is the share of test rows whose predicted label equals
# the true label. Computing it from y_pred avoids the second full prediction pass that
# logistic_reg.score(X_test, y_test) would run internally.
accuracy = float((y_test == y_pred).mean())
print("Accuracy:", accuracy)

Accuracy: 0.6231644397407065
