## Bhuvnesh Sahu 
### MDS202316

In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [47]:
import warnings
# NOTE(review): with no category or module argument this filter hides every
# warning raised after this point in the session. Consider narrowing it to the
# specific warnings that are known to be noise.
warnings.filterwarnings("ignore")

In [48]:
# Read the train / validation / test CSV files (paths are relative to the
# notebook's working directory) into three DataFrames.
train, validation, test = map(
    pd.read_csv,
    ('train.csv', 'validation.csv', 'test.csv'),
)

In [49]:
# Remove every row that has a missing value in any column, for all three splits.
train, validation, test = (
    frame.dropna() for frame in (train, validation, test)
)

In [50]:
# Display the training frame as it looks after the missing-value rows were dropped.
train

Unnamed: 0,target,num_characters,num_words,text
0,0,80,18,yup hey then one day on fri we can ask miwa an...
1,0,34,8,have you ever had one foot befor
2,0,107,29,cud u tell ppl im gona b a bit l8 co 2 buse ha...
3,0,327,84,hey babe sorri i did get sooner gari can come ...
4,1,148,34,hi custom loyalti offer the new nokia6650 mobi...
...,...,...,...,...
3716,0,99,21,it hard to believ thing like thi all can say l...
3717,0,23,6,ok leav no need to ask
3718,1,159,27,u can win of music gift voucher everi week sta...
3719,0,113,30,huh hyde park not in mel ah opp got confus any...


In [51]:
# Print how many rows of each `target` class every split contains.
splits = {"Train": train, "Validation": validation, "Test": test}
for name, df in splits.items():
    label_counts = df['target'].value_counts()
    print(f"\n{name} label distribution:\n{label_counts}")



Train label distribution:
target
0    3248
1     470
Name: count, dtype: int64

Validation label distribution:
target
0    362
1     52
Name: count, dtype: int64

Test label distribution:
target
0    902
1    131
Name: count, dtype: int64


In [52]:
# Vectorizing the text data using TF-IDF
def vectorize_data(train_data, validation_data, test_data, max_features=3000):
    """Convert the ``text`` column of each split into a dense TF-IDF matrix.

    The vectorizer is fitted on the training split only and then applied
    unchanged to the validation and test splits, so the vocabulary and IDF
    weights never depend on held-out data.

    Parameters
    ----------
    train_data, validation_data, test_data : pandas.DataFrame
        Frames that each provide a ``text`` column.
    max_features : int or None, default 3000
        Upper bound on the vocabulary size passed to ``TfidfVectorizer``.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    tuple
        ``(X_train, X_validation, X_test, vectorizer)`` where the first three
        items are dense arrays (``.toarray()`` is applied to each sparse
        result) and ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    # Fit on train only; the other splits are merely transformed.
    X_train = vectorizer.fit_transform(train_data['text']).toarray()
    X_validation, X_test = (
        vectorizer.transform(split['text']).toarray()
        for split in (validation_data, test_data)
    )
    return X_train, X_validation, X_test, vectorizer

X_train, X_validation, X_test, vectorizer = vectorize_data(train, validation, test)

In [53]:
# Hand-crafted numeric columns that are appended to the TF-IDF features.
EXTRA_FEATURE_COLUMNS = ['num_characters', 'num_words']

train_extra_features = train[EXTRA_FEATURE_COLUMNS]
validation_extra_features = validation[EXTRA_FEATURE_COLUMNS]
test_extra_features = test[EXTRA_FEATURE_COLUMNS]

# Width of the TF-IDF part of each matrix, taken from the fitted vectorizer.
n_tfidf_columns = len(vectorizer.vocabulary_)

# Horizontally stack the extracted features with the existing vectorized data.
# Slicing back to the TF-IDF width first makes this cell idempotent: on the
# first run the slice is a no-op, and on a re-run it strips the columns that
# were appended earlier instead of stacking them a second time.
X_train = np.hstack((X_train[:, :n_tfidf_columns], train_extra_features))
X_validation = np.hstack((X_validation[:, :n_tfidf_columns], validation_extra_features))
X_test = np.hstack((X_test[:, :n_tfidf_columns], test_extra_features))

In [54]:
# Sanity check of the final training matrix: one row per training sample, and
# one column per TF-IDF feature plus the two appended count columns.
X_train.shape

(3718, 3002)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [57]:
# Baseline classifiers to compare. Every estimator that takes a `random_state`
# gets the same seed so that re-running the notebook reproduces the scores;
# the decision tree and the liblinear logistic regression were previously
# left unseeded.
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)

In [58]:
# Map a short display label to each estimator; insertion order is the order
# in which the models are trained and reported.
clfs = dict(zip(
    ('KN', 'NB', 'DT', 'LR', 'RF', 'AdaBoost'),
    (knc, mnb, dtc, lrc, rfc, abc),
))

## Train on X_train and validate on X_validation

In [59]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and score its predictions on an evaluation split.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels the fitted model is scored on.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` as computed by ``accuracy_score`` and
        ``precision_score`` on ``y_test`` versus the predictions.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

In [40]:
# Fit each candidate on the training matrix, score it on the validation
# matrix, and keep the scores in the same order as `clfs`.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    val_accuracy, val_precision = train_classifier(
        clf,
        X_train, train['target'],
        X_validation, validation['target'],
    )

    print("For ", name)
    print("Accuracy - ", val_accuracy)
    print("Precision - ", val_precision)

    accuracy_scores.append(val_accuracy)
    precision_scores.append(val_precision)

For  KN
Accuracy -  0.9299516908212561
Precision -  0.7674418604651163
For  NB
Accuracy -  0.9202898550724637
Precision -  0.9523809523809523
For  DT
Accuracy -  0.927536231884058
Precision -  0.7115384615384616
For  LR
Accuracy -  0.9589371980676329
Precision -  0.972972972972973
For  RF
Accuracy -  0.9685990338164251
Precision -  1.0
For  AdaBoost
Accuracy -  0.961352657004831
Precision -  1.0


In [42]:
# Collect the validation scores in one table, highest precision first.
performance_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
})
performance_df = performance_df.sort_values(by='Precision', ascending=False)
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
4,RF,0.968599,1.0
5,AdaBoost,0.961353,1.0
3,LR,0.958937,0.972973
1,NB,0.92029,0.952381
0,KN,0.929952,0.767442
2,DT,0.927536,0.711538


## Hyper-parameter Tuning

In [46]:
# Candidate hyper-parameter values, one grid per model being tuned.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
}
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# Options shared by all three searches: 5-fold cross-validation scored by
# accuracy, with n_jobs=-1 for parallel execution.
search_kwargs = {'cv': 5, 'scoring': 'accuracy', 'n_jobs': -1}

grid_rf = GridSearchCV(rfc, param_grid_rf, **search_kwargs)
grid_adaboost = GridSearchCV(abc, param_grid_adaboost, **search_kwargs)
grid_lr = GridSearchCV(lrc, param_grid_lr, **search_kwargs)

# Run the searches on the training data (RF, then AdaBoost, then LR).
for search in (grid_rf, grid_adaboost, grid_lr):
    search.fit(X_train, train['target'])

# Report the winning configuration and its mean cross-validated accuracy.
print("Best RF Parameters:", grid_rf.best_params_)
print("Best RF Accuracy:", grid_rf.best_score_)

print("\nBest AdaBoost Parameters:", grid_adaboost.best_params_)
print("Best AdaBoost Accuracy:", grid_adaboost.best_score_)

print("\nBest LR Parameters:", grid_lr.best_params_)
print("Best LR Accuracy:", grid_lr.best_score_)


Best RF Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
Best RF Accuracy: 0.9714919897538316

Best AdaBoost Parameters: {'learning_rate': 1, 'n_estimators': 200}
Best AdaBoost Accuracy: 0.9717597215589227

Best LR Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best LR Accuracy: 0.9779446880562672


## Scores on Test data

In [61]:
# Build the final logistic regression from the hyper-parameters selected by
# the grid search rather than copying them by hand, so this cell cannot drift
# out of sync with the tuning cell when the grid or the data change.
lr = LogisticRegression(**grid_lr.best_params_)

# Train the model on the full training matrix
lr.fit(X_train, train['target'])

# Make predictions on the held-out test matrix
y_pred = lr.predict(X_test)

# Calculate metrics against the true test labels
accuracy = accuracy_score(test['target'], y_pred)
precision = precision_score(test['target'], y_pred)
recall = recall_score(test['target'], y_pred)

# Print a small framed report, each metric rounded to four decimals
print("=" * 40)
print(" Logistic Regression Performance Metrics ")
print("=" * 40)
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print("=" * 40)

 Logistic Regression Performance Metrics 
Accuracy  : 0.9777
Precision : 0.9500
Recall    : 0.8702
