### Logistic Regression
#### DSC 672 - Group 5 - Impact Genome
Julia Aptekar, DePaul University, japtekar@depaul.edu

John Leniart, DePaul University, jleniart@depaul.edu

Arham Mehdi, DePaul University kmehdi@depaul.edu

Natalie Olechno, DePaul University, nolechno@depaul.edu

------------------------------

In [3]:
cd C:\\Users\\jclen\\OneDrive\\Desktop\\DSC 672 - Capstone\\Group Project\\Data

C:\Users\jclen\OneDrive\Desktop\DSC 672 - Capstone\Group Project\Data


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
fileName = 'Validated Data from Heather.xlsx' #original dataset
#fileName = 'Combined Data.xlsx' #contains original data and synthetic data
df = pd.read_excel(fileName)
df.head(5)

Unnamed: 0,programreportid,programdescription,impactarea,genome,outcome,outcomeid
0,587,Free community programming is a pillar of our ...,Arts,Access and Appreciation,Reached Diverse Audiences,155.0
1,780,My Program rule!,Arts,Access and Appreciation,Enhanced Arts Appreciation,67.0
2,877,TBD\xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...,Arts,Access and Appreciation,Enhanced Arts Appreciation,67.0
3,892,VGâ€™s Arts Education Programs yearly engage n...,Arts,Access and Appreciation,Enhanced Arts Appreciation,67.0
4,899,"The Met Corporate Patron Program, led by the M...",Arts,Access and Appreciation,Enhanced Arts Appreciation,67.0


------------------

#### Preprocessing

In [8]:
#Drop all null values
df = df.dropna()
df.count()

programreportid       6442
programdescription    6442
impactarea            6442
genome                6442
outcome               6442
outcomeid             6442
dtype: int64

In [9]:
df['outcomeid'] = df['outcomeid'].astype(str)
df['programreportid'] = df['programreportid'].astype(str)
df.dtypes

programreportid       object
programdescription    object
impactarea            object
genome                object
outcome               object
outcomeid             object
dtype: object

In [10]:
#Remove special characters from program description
import re
df['programdescription'] = [re.sub(r"[^A-Za-z0-9 :.,'-]+", "", x) for x in df['programdescription']]
#Leave only letters, numbers, and some punctutation marks (colon, period, comma, apostraphe, hyphen)

In [11]:
#We noticed some rows that have placeholder values for program description
id_lst =['168', '186', '3238', '3461', '3473', '362', '3677', '3744', '3746', '3770', '3794',
         '4012', '4250', '453', '464', '471', '4815', '6917', '7555', '780', '877']

#Program description is the main input for our model. So, we want to remove the rows with placeholder values
df.drop(df[df['programreportid'].isin(id_lst)].index, inplace=True)
df.count()

programreportid       6421
programdescription    6421
impactarea            6421
genome                6421
outcome               6421
outcomeid             6421
dtype: int64

--------------

#### Split Data in Train/Test

In [14]:
#Split dataset into feature variables and target variable
#X = df.iloc[:, :-1] #everything except the last column
#X = df.iloc[:, 1] #program description
#y = df.iloc[:, -1] #outcome id

In [15]:
#Our dataset has outcome IDs that only appear once
#We need to make sure they are included in the training data for the model
#So, we have to manually separate out the outcome IDs that only appear once
value_counts = df['outcomeid'].value_counts()
rare_classes = value_counts[value_counts == 1].index
rare_data = df[df.iloc[:, -1].isin(rare_classes)]
df_main = df[~df.iloc[:, -1].isin(rare_classes)]

In [16]:
#Perform stratified split on the main dataset
X_main = df_main.iloc[:, :-1]
y_main = df_main.iloc[:, -1]
X_train_main, X_test_main, y_train_main, y_test_main = train_test_split(X_main, y_main, test_size=0.2, random_state=42, stratify=y_main)

In [17]:
#Combine the rare classes into the training or test set as needed
X_train = pd.concat([X_train_main, rare_data.iloc[:, :-1]], axis=0)
y_train = pd.concat([y_train_main, rare_data.iloc[:, -1]], axis=0)
X_test = X_test_main
y_test = y_test_main

In [18]:
print('Rows in training data: ',len(X_train))
print('Rows in testing data: ',len(X_test))

Rows in training data:  5150
Rows in testing data:  1271


In [19]:
X_train = X_train.iloc[:, 1]
X_test = X_test.iloc[:, 1]

In [20]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

----------------

#### Encoding

In [23]:
#Define a ColumnTransformer to apply OneHotEncoding to categorical columns
#All columns in X_train are categorical strings
categorical_features = X_train.select_dtypes(include=['object']).columns  # Identify categorical columns

In [24]:
#Apply OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough'  #Leave non-categorical columns unchanged
)

-----------

#### Baseline Logistic Regression Model with Pipeline

In [27]:
#Define the Logistic Regression model within a pipeline
#The pipeline first applies the preprocessor (OneHotEncoder) and then fits Logistic Regression model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [28]:
#Train the model
model_pipeline.fit(X_train, y_train)

In [29]:
#Make predictions on the testing data
y_pred = model_pipeline.predict(X_test)

----------------------

#### Evaluate Baseline Model Performance

In [32]:
#Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Accuracy: 0.2722
F1 Score: 0.2885
Precision: 0.4393
Recall: 0.2722


  _warn_prf(average, modifier, msg_start, len(result))


----------------

#### Parameter Tuning Using GridSearch

In [35]:
#Define the parameter grid for GridSearchCV
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],  #Regularization strength for Logistic Regression
    'classifier__solver': ['lbfgs', 'liblinear'],  #Solvers to test
    'preprocessor__cat__handle_unknown': ['ignore', 'error']  #Test different strategies for handling unknown categories
}

In [36]:
#Setup the GridSearchCV
grid_search = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

#Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

#Get the best model and hyperparameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")



Fitting 5 folds for each of 12 candidates, totalling 60 fits


 0.22058252        nan 0.34485437        nan 0.34446602        nan]


Best parameters: {'classifier__C': 10.0, 'classifier__solver': 'lbfgs', 'preprocessor__cat__handle_unknown': 'ignore'}
Best cross-validation score: 0.3448543689320388


In [37]:
# Test the model with the best parameters
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")

Test set score: 0.38552321007081036


------------

#### Evaluate Logistic Regression Model Performance After Parameter Tuning

In [40]:
#make predictions on test data
y_pred2 = best_model.predict(X_test)

In [41]:
#Calculate evaluation metrics
accuracy_tune = accuracy_score(y_test, y_pred2)
f1_tune = f1_score(y_test, y_pred2, average='weighted')
precision_tune = precision_score(y_test, y_pred2, average='weighted')
recall_tune = recall_score(y_test, y_pred2, average='weighted')

print(f'Accuracy: {accuracy_tune:.4f}')
print(f'F1 Score: {f1_tune:.4f}')
print(f'Precision: {precision_tune:.4f}')
print(f'Recall: {recall_tune:.4f}')

Accuracy: 0.3855
F1 Score: 0.4130
Precision: 0.5274
Recall: 0.3855


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
