## IE 7300 Statistical Learning
## Aditi Chadha
## Part 2

In [1]:
# Import necessary libraries
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import time

# Please upload file 'data_after_EDA.csv' (attached) or read the file from local - to run the code seamlessly

In [2]:
from google.colab import files
uploaded = files.upload()

Saving data_after_EDA.csv to data_after_EDA.csv


In [3]:
data = pd.read_csv('data_after_EDA.csv')
data

Unnamed: 0,n_tokens_title,n_tokens_content,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,average_token_length,kw_avg_min,kw_min_max,kw_avg_avg,self_reference_min_shares,...,data_channel_is_entertainment,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,is_weekend,shares_c
0,12.0,219.0,0.815385,4.0,2.0,4.680365,0.000,0.0,820.944678,496.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,9.0,255.0,0.791946,3.0,1.0,4.913725,0.000,0.0,820.944678,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,9.0,211.0,0.663866,3.0,1.0,4.393365,0.000,0.0,809.211821,918.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,9.0,531.0,0.665635,9.0,0.0,4.404896,0.000,0.0,820.944678,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,13.0,1072.0,0.540890,19.0,3.1,4.682836,0.000,0.0,1228.468513,545.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39638,11.0,346.0,0.684783,9.0,7.0,4.523121,173.125,3231.8,3031.115764,1818.4,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
39639,12.0,328.0,0.885057,9.0,7.0,4.405488,184.000,6500.0,3411.660830,2100.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
39640,10.0,442.0,0.644128,24.0,1.0,5.076923,168.250,6200.0,4206.439195,1400.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
39641,6.0,682.0,0.692661,10.0,1.0,4.975073,-1.000,0.0,1777.895883,452.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
categorical_cols = ['num_imgs', 'num_videos', 'num_keywords', 'LDA']
encoded_data = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=int)
encoded_data = encoded_data.astype(float)
X = encoded_data.drop(columns=['shares_c'])
y = encoded_data['shares_c']


train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Scale the features (optional, but often beneficial for logistic regression)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In this model building, we tried different ways of building the Neural Network model by adding more epochs, adding more layers, activation functions etc

* Training:
Compile: All three models are compiled with the same optimizer (Adam), loss function (binary cross-entropy for binary classification), and evaluation metric (accuracy).

* Fit (Training): Each model is trained using the fit method with the training data (train_X, train_y) for 10 epochs, using batches of 32 samples. Validation data (test_X, test_y) is provided to evaluate the model's performance after each epoch.

* Evaluation:  After training each model, the code evaluates the model's performance on the test set (test_X, test_y) using the evaluate method. It computes the loss and accuracy of the model on unseen data.

In [5]:
# Custom F1 Score Metric
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        precision_result = self.precision.result()
        recall_result = self.recall.result()

        # F1 Score calculation
        f1 = 2 * (precision_result * recall_result) / (precision_result + recall_result + 1e-15)
        return f1

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()

### Model 0: Epochs-50, Batch Size-32

In [6]:
# Define the model

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
start_time = time.time()
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_X, train_y, epochs=50, batch_size=32, validation_data=(test_X, test_y),verbose=0)

# Evaluate the model and print metrics
y_pred = model.predict(test_X)
y_pred_binary = (y_pred > 0.5).astype(int)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")



**Execution time: 76.4545 seconds**



Metrics for training:

Precision: 0.7873059144245584
Recall: 0.7850726864436828
Accuracy: 0.7721511004603645
F1 Score: 0.7861877145224286


Metrics for testing:

Precision: 0.6468044209514656
Recall: 0.6362562042070432
Accuracy: 0.620506999621642
F1 Score: 0.6414869534135589


It seems that the model might be slightly overfitting to the training data, showing a tradeoff where it fits the training data well (low bias) but struggles to generalize to new data (high variance). This can be said because the drop in performance metrics (lower precision, recall, accuracy, and F1 score) on the testing set compared to the training set suggests an increase in variance. The model might be overly sensitive to the specificities of the training data and fails to generalize well to unseen data, indicating higher variance or overfitting

### Model 1 : Epochs: 100, Batch Size:32

In [7]:
model1 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='sigmoid', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


start_time = time.time()

# Compile the model
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

# Train the model
model1.fit(train_X, train_y, epochs=100, batch_size=32, validation_data=(test_X, test_y),verbose=0)

# Evaluate the model and print metrics
# Evaluate the model and print metrics
y_pred = model1.predict(test_X)
y_pred_binary = (y_pred > 0.5).astype(int)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model1.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 143.2638 seconds**



Metrics for training:

Precision: 0.7902093313209528
Recall: 0.7116180120553126
Accuracy: 0.7453175253831116
F1 Score: 0.7488573116507571


Metrics for testing:

Precision: 0.6661392405063291
Recall: 0.597021980619239
Accuracy: 0.6252995333585571
F1 Score: 0.6296896422784495


The model shows a reasonable level of bias as indicated by relatively high precision, recall, accuracy, and F1 score on the training set. It suggests the model fits the training data well and captures its pattern. However, there is a notable drop in performance on the testing set compared to the training set. The decrease in precision, recall, accuracy, and F1 score indicates a potential issue with variance or overfitting.

### Model 1.1 : Epochs: 100, Batch Size:32 - with dropout regularization

In [8]:
model1_1 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dropout(0.2),  # Add dropout
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),  # Add dropout
    tf.keras.layers.Dense(1, activation='sigmoid')
])

start_time = time.time()
# Compile the model
model1_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

# Train the model
model1_1.fit(train_X, train_y, epochs=100, batch_size=32, validation_data=(test_X, test_y),verbose=0)

y_pred = model1_1.predict(test_X)
y_pred_binary = (y_pred > 0.5).astype(int)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model1_1.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 145.3211 seconds**



Metrics for training:

Precision: 0.7123344157517654
Recall: 0.8166883347122089
Accuracy: 0.7262092451283345
F1 Score: 0.760950362029568


Metrics for testing:

Precision: 0.6450413223140495
Recall: 0.7378870243441267
Accuracy: 0.6434607138352881
F1 Score: 0.6883474809833536


### Model 2: epochs -150, Batch Size - 64

In [9]:
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


# Compile the model
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

start_time = time.time()
# Train the model
model2.fit(train_X, train_y, epochs=150, batch_size=64, validation_data=(test_X, test_y),verbose=0)


y_pred = model2.predict(test_X)
y_pred_binary = (y_pred > 0.5).astype(int)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model2.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 130.4576 seconds**



Metrics for training:

Precision: 0.9293795235752934
Recall: 0.7815861009336957
Accuracy: 0.8517689348552689
F1 Score: 0.8490996051744615


Metrics for testing:

Precision: 0.6531196336576989
Recall: 0.5393523989600567
Accuracy: 0.6013368646739816
F1 Score: 0.5908090614886732


Complex Architecture: Multi-layered model with varying activation functions.
Overfitting Indication: Discrepancy between training and testing metrics suggests the model might be too complex, leading to overfitting.




### Model 3: Epoch 150, Batch Size 128

In [10]:
model3 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


# Compile the model
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

start_time = time.time()
# Train the model
model3.fit(train_X, train_y, epochs=150, batch_size=128, validation_data=(test_X, test_y),verbose=0)



y_pred = model3.predict(test_X)
y_pred_binary = (y_pred > 0.5).astype(int)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model3.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 83.1442 seconds**



Metrics for training:

Precision: 0.7926429960094137
Recall: 0.9155537170547217
Accuracy: 0.827142586870152
F1 Score: 0.8496764286497751


Metrics for testing:

Precision: 0.6257466529351184
Recall: 0.7180335618057196
Accuracy: 0.6203808803127759
F1 Score: 0.6687211093990755


### Model 4: Epochs : 150, Batch Size : 256

In [11]:
model4 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


# Compile the model
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

start_time = time.time()
# Train the model
model4.fit(train_X, train_y, epochs=150, batch_size=256, validation_data=(test_X, test_y),verbose=0)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model4.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 52.1370 seconds**



Metrics for training:

Precision: 0.8108488706134175
Recall: 0.850667769767167
Accuracy: 0.8144352651825693
F1 Score: 0.8302811824080751


Metrics for testing:

Precision: 0.6257466529351184
Recall: 0.7180335618057196
Accuracy: 0.6203808803127759
F1 Score: 0.6687211093990755


### Model 5 - Epochs : 150, Batch Size : 512

In [12]:
model5 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


# Compile the model
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])

start_time = time.time()
# Train the model
model5.fit(train_X, train_y, epochs=150, batch_size=512, validation_data=(test_X, test_y),verbose=0)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print("\n")
print("**Execution time: {:.4f} seconds**\n".format(execution_time))
print("\n")
# Evaluate the model and print metrics
y_pred_X = model5.predict(train_X)
y_pred_binary_X = (y_pred_X > 0.5).astype(int)

precision = precision_score(train_y, y_pred_binary_X)
recall = recall_score(train_y, y_pred_binary_X)
accuracy = accuracy_score(train_y, y_pred_binary_X)
f1 = f1_score(train_y, y_pred_binary_X)

print("Metrics for training:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\n")
precision = precision_score(test_y, y_pred_binary)
recall = recall_score(test_y, y_pred_binary)
accuracy = accuracy_score(test_y, y_pred_binary)
f1 = f1_score(test_y, y_pred_binary)

print("Metrics for testing:\n")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

  m.reset_state()




**Execution time: 34.7548 seconds**



Metrics for training:

Precision: 0.8763484953630686
Recall: 0.8208840562581255
Accuracy: 0.8426247083307057
F1 Score: 0.8477100051871969


Metrics for testing:

Precision: 0.6257466529351184
Recall: 0.7180335618057196
Accuracy: 0.6203808803127759
F1 Score: 0.6687211093990755
