In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report

# Data preprocessing

Load the pickled working dataset obtained from data_extraction.ipynb.

In [None]:
def getfile(f, **kwargs):
  """Download a pickled object from a Google Drive share link and unpickle it.

  Parameters
  ----------
  f : str
    Google Drive share URL. The file id is taken from a ``/d/<id>`` path
    segment if present, otherwise from an ``id=<id>`` query parameter. If
    neither is found, the second-to-last ``/``-separated piece of the URL
    is used, which is the rule this helper originally applied.
  **kwargs
    Forwarded unchanged to ``pd.read_pickle``.

  Returns
  -------
  object
    Whatever ``pd.read_pickle`` returns for the downloaded file.

  Raises
  ------
  IndexError
    If no id pattern matches and ``f`` has fewer than two ``/``-separated
    pieces.

  Notes
  -----
  Unpickling can execute arbitrary code, so only pass links to files you
  created or otherwise trust.
  """
  import re  # local import keeps this helper self-contained

  # Locate the id explicitly rather than trusting its position in the URL, so
  # links without a trailing "/view" (or "open?id=..." links) also work.
  match = re.search(r'/d/([\w-]+)', f) or re.search(r'[?&]id=([\w-]+)', f)
  file_id = match.group(1) if match else f.split('/')[-2]

  loc = 'https://drive.google.com/uc?export=download&id=' + file_id
  return pd.read_pickle(loc, **kwargs)

# Google Drive share link to the pickled working dataset.
f = "https://drive.google.com/file/d/1pltF9KHbAy4UPCin6W_qI317A6-xaCdT/view?usp=drive_link"
# Download and unpickle it via getfile, then preview the first rows.
df = getfile(f)
df.head()

Unnamed: 0,Area Name,Area Type,Year,Month,Date_Numeric,Seasonally Adjusted(Y/N),Status,Labor Force,Employment,Unemployment,Unemployment Rate,Benchmark
0,California,State,1976,January,1976-01-01,N,Final,9672362,8668016,1004346,10.4,2020
1,California,State,1976,January,1976-01-01,Y,Final,9774280,8875685,898595,9.2,2020
2,California,State,1976,February,1976-02-01,N,Final,9684440,8704564,979876,10.1,2020
3,California,State,1976,February,1976-02-01,Y,Final,9768885,8871553,897332,9.2,2020
4,California,State,1976,March,1976-03-01,N,Final,9689626,8776344,913282,9.4,2020


First truncate the datetime values in the 'Date_Numeric' column to monthly frequency (the first day of each month). Then convert them to Julian-date floats so the date can be used as a numeric feature.

In [None]:
# Snap each date to the first day of its month.
month_start = df['Date_Numeric'].dt.to_period('M').dt.to_timestamp()
# Express the month start as a Julian date (float). DatetimeIndex.to_julian_date
# is vectorised, so no per-row Python lambda is needed, and 'Date' is written
# once instead of briefly holding Period values.
df['Date'] = pd.DatetimeIndex(month_start).to_julian_date().to_numpy()
df.head()

Unnamed: 0,Area Name,Area Type,Year,Month,Date_Numeric,Seasonally Adjusted(Y/N),Status,Labor Force,Employment,Unemployment,Unemployment Rate,Benchmark,Date
0,California,State,1976,January,1976-01-01,N,Final,9672362,8668016,1004346,10.4,2020,2442778.5
1,California,State,1976,January,1976-01-01,Y,Final,9774280,8875685,898595,9.2,2020,2442778.5
2,California,State,1976,February,1976-02-01,N,Final,9684440,8704564,979876,10.1,2020,2442809.5
3,California,State,1976,February,1976-02-01,Y,Final,9768885,8871553,897332,9.2,2020,2442809.5
4,California,State,1976,March,1976-03-01,N,Final,9689626,8776344,913282,9.4,2020,2442838.5


Drop redundant columns.

In [None]:
# Remove the columns the notebook treats as redundant.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Year', 'Month', 'Date_Numeric'], errors='ignore')
df.head()

Unnamed: 0,Area Name,Area Type,Seasonally Adjusted(Y/N),Status,Labor Force,Employment,Unemployment,Unemployment Rate,Benchmark,Date
0,California,State,N,Final,9672362,8668016,1004346,10.4,2020,2442778.5
1,California,State,Y,Final,9774280,8875685,898595,9.2,2020,2442778.5
2,California,State,N,Final,9684440,8704564,979876,10.1,2020,2442809.5
3,California,State,Y,Final,9768885,8871553,897332,9.2,2020,2442809.5
4,California,State,N,Final,9689626,8776344,913282,9.4,2020,2442838.5


Convert categorical variables to numerical values using get_dummies() to produce one-hot encodings. Drop one dummy variable in each category to prevent multicollinearity.

In [None]:
# Columns that are expanded into indicator (dummy) columns.
categorical_cols = [
    'Area Name',
    'Area Type',
    'Seasonally Adjusted(Y/N)',
    'Status',
    'Benchmark',
]
# drop_first=True leaves out one level per encoded column.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df.head()

Unnamed: 0,Labor Force,Employment,Unemployment,Unemployment Rate,Date,Area Name_Adelanto city,Area Name_Agoura Hills city,Area Name_Alameda County,Area Name_Alameda city,Area Name_Alamo CDP,...,Area Name_Yuba County,Area Name_Yucaipa city,Area Name_Yucca Valley town,Area Type_Metropolitan Area,Area Type_State,Area Type_Sub-County Place,Seasonally Adjusted(Y/N)_Y,Status_Preliminary,Benchmark_2021,Benchmark_2023
0,9672362,8668016,1004346,10.4,2442778.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,9774280,8875685,898595,9.2,2442778.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
2,9684440,8704564,979876,10.1,2442809.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,9768885,8871553,897332,9.2,2442809.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
4,9689626,8776344,913282,9.4,2442838.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [None]:
# (rows, columns) of the frame after one-hot encoding.
df.shape

(204096, 1032)

# Linear regression

We wish to predict unemployment rate using the other features.

In [None]:
# Target: the unemployment rate. Features: every other column of df.
# NOTE(review): X keeps 'Labor Force', 'Employment' and 'Unemployment'; the
# rate looks derivable from these counts — confirm that using them is intended.
y = df['Unemployment Rate']
X = df.drop(columns=['Unemployment Rate'])

Split the data into training and test sets, scale the features to have a mean
of 0 and a standard deviation of 1 (using statistics computed on the training
set only), then train and evaluate the model.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training rows and apply them to both splits.
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# Fit the linear model on the scaled training data and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the test-set error metrics and R^2.
for label, metric in (
    ('mse loss: ', mean_squared_error),
    ('mae loss: ', mean_absolute_error),
    ('r2 score: ', r2_score),
):
    print(label, metric(y_test, y_pred))

mse loss:  9.092278357697737
mae loss:  2.2087119212727684
r2 score:  0.8330856170363987


# Logistic regression

In [None]:
# Show the encoded frame before the classification target is built.
df

Unnamed: 0,Labor Force,Employment,Unemployment,Unemployment Rate,Date,Area Name_Adelanto city,Area Name_Agoura Hills city,Area Name_Alameda County,Area Name_Alameda city,Area Name_Alamo CDP,...,Area Name_Yuba County,Area Name_Yucaipa city,Area Name_Yucca Valley town,Area Type_Metropolitan Area,Area Type_State,Area Type_Sub-County Place,Seasonally Adjusted(Y/N)_Y,Status_Preliminary,Benchmark_2021,Benchmark_2023
0,9672362,8668016,1004346,10.4,2442778.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,9774280,8875685,898595,9.2,2442778.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
2,9684440,8704564,979876,10.1,2442809.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,9768885,8871553,897332,9.2,2442809.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
4,9689626,8776344,913282,9.4,2442838.5,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204091,9100,8200,900,9.9,2460554.5,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
204092,1000,900,100,8.8,2460554.5,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
204093,4900,4600,300,6.3,2460554.5,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
204094,7300,6900,500,6.4,2460554.5,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True


We wish to predict a binary unemployment class (derived from the unemployment rate) using logistic regression.



First, we rescale the unemployment rate to the range [0, 1] using min-max scaling.

In [None]:
# Min-max rescale the unemployment rate into [0, 1].
# NOTE(review): the result overwrites the original column in df, so any cell
# that reads 'Unemployment Rate' after this point (including a re-run of an
# earlier cell) sees the rescaled values.
scaler = MinMaxScaler().fit(df[['Unemployment Rate']])
df['Unemployment Rate'] = scaler.transform(df[['Unemployment Rate']]).ravel()

Second, we create an "Unemployment class" column to serve as the prediction target.

In [None]:
# Binary target derived from the 'Unemployment Rate' column:
#   class 0 -> rate <= 0.5
#   class 1 -> everything else (rate > 0.5; a missing rate also lands here)
at_or_below_half = df['Unemployment Rate'].le(0.5)
df['Unemployment class'] = (~at_or_below_half).astype('int64')

In [None]:
# Show the frame again, now including the 'Unemployment class' column.
df

Unnamed: 0,Labor Force,Employment,Unemployment,Unemployment Rate,Date,Area Name_Adelanto city,Area Name_Agoura Hills city,Area Name_Alameda County,Area Name_Alameda city,Area Name_Alamo CDP,...,Area Name_Yucaipa city,Area Name_Yucca Valley town,Area Type_Metropolitan Area,Area Type_State,Area Type_Sub-County Place,Seasonally Adjusted(Y/N)_Y,Status_Preliminary,Benchmark_2021,Benchmark_2023,Unemployment class
0,9672362,8668016,1004346,0.104,2442778.5,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,0
1,9774280,8875685,898595,0.092,2442778.5,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,0
2,9684440,8704564,979876,0.101,2442809.5,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,0
3,9768885,8871553,897332,0.092,2442809.5,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,0
4,9689626,8776344,913282,0.094,2442838.5,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204091,9100,8200,900,0.099,2460554.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,0
204092,1000,900,100,0.088,2460554.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,0
204093,4900,4600,300,0.063,2460554.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,0
204094,7300,6900,500,0.064,2460554.5,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,0


Build the feature matrix X and the target vector y.

In [None]:
# The class label and the rate it was derived from are both kept out of the features.
# NOTE(review): 'Labor Force', 'Employment' and 'Unemployment' remain in X and
# appear to determine the rate — confirm that this is intended.
target_cols = ['Unemployment Rate', 'Unemployment class']
y = df['Unemployment class']
X = df.drop(target_cols, axis='columns')

In [None]:
# Split the data.
# The positive class is very rare, so stratify on y to give the training and
# test sets the same class proportions instead of leaving it to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features (statistics are learned from the training rows only)
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

# Initialize and train the logistic regression model.
# max_iter is raised above the default of 100 to give the solver room to
# converge on this wide feature matrix; it changes nothing if the solver
# already converges sooner.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print classification metrics.
# With so few positive rows, accuracy is dominated by class 0; the per-class
# rows of the report are the informative part.
print('Accuracy:', accuracy_score(y_test, y_pred))
# Print the report on its own line: passing it as a second print() argument
# inserts a separator space that shifts the report's header row by one column.
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.9989955903968643
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     40726
           1       0.84      0.69      0.76        94

    accuracy                           1.00     40820
   macro avg       0.92      0.85      0.88     40820
weighted avg       1.00      1.00      1.00     40820

