In [1]:
# Mount Google Drive under /content/drive so the dataset path used below resolves.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Reference notebooks: https://github.com/halpert3/complaint-content-classification-nlp/tree/main/notebooks

# Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from statistics import mean


In [3]:
# Load the preprocessed complaints dataset from the mounted Drive.
# NOTE: two directory names in this path end with a space; keep the literal byte-exact.
df = pd.read_csv('/content/drive/MyDrive/MS DATA SCIENCE /TESE /data_processed.csv')

# Multiclass Classification and Holdout CV

In [None]:
# Convert the textual complaint reason into integer class ids (4 classes).
reason_dict = {
    'Mau Serviço Prestado': 0,
    'Condições de entrega': 1,
    'Atraso de entrega': 2,
    'Enganos': 3,
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not update `df` under pandas Copy-on-Write. The explicit cast keeps an integer
# dtype and fails fast if an unmapped (non-numeric) label is left in the column.
df['reason'] = df['reason'].replace(reason_dict).astype('int64')

In [None]:
# Sanity check: list the distinct values now stored in the encoded target column.
df['reason'].unique()

array([0, 1, 2, 3])

In [None]:
# Features are the narrative column (vectorised with TF-IDF further down);
# the target is the encoded complaint reason.
X, y = df['narrative_tfidf'], df['reason']

In [None]:
# Print the share of each class in the full dataset.
for label, count in Counter(y).items():
  print(label, count/len(y))

0 0.3551493535443602
1 0.09893000445831476
2 0.4782880071333036
3 0.0676326348640214


# Train test split

In [None]:
# we should always split before transforming the data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider passing stratify=y.
X_train, X_test, y_train_tf, y_test_tf = train_test_split(
    X,
    y.values,
    test_size=0.20,
    random_state=200,
)

In [None]:
# Print the share of each class in the training labels.
for label, count in Counter(y_train_tf).items():
  print(label, count/len(y_train_tf))

0 0.35644226482389657
3 0.06754346856888097
2 0.4772625947391886
1 0.09875167186803388


In [None]:
# Print the share of each class in the test labels.
for label, count in Counter(y_test_tf).items():
  print(label, count/len(y_test_tf))

0 0.3499777084262149
2 0.4823896567097637
1 0.09964333481943825
3 0.06798930004458315


In [4]:
def training(classifier, X_train_tf, X_test_tf, y_train_tf, y_test_tf):
    """Fit ``classifier`` on the training split and score it on both splits.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``.
    X_train_tf, X_test_tf : feature matrices for the training / test rows.
    y_train_tf, y_test_tf : true labels for the training / test rows.

    Returns
    -------
    tuple
        ``(train_accuracy, test_classification_report, test_confusion_matrix)``.
        Callers must unpack three values.
    """
    # Fit the supplied estimator (any classifier, not only a decision tree).
    classifier.fit(X_train_tf, y_train_tf)

    # Training split: accuracy on the data the model was fitted on.
    train_pred = classifier.predict(X_train_tf)
    train_accuracy = accuracy_score(y_train_tf, train_pred)

    # Test split: per-class report and confusion matrix.
    test_pred = classifier.predict(X_test_tf)
    report = classification_report(y_test_tf, test_pred)
    matrix = confusion_matrix(y_test_tf, test_pred)

    return train_accuracy, report, matrix

## Unigrams only

In [None]:
# Build TF-IDF features from unigrams. The vocabulary is fitted on the training
# narratives only and then reused, unchanged, to transform the test narratives.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [None]:
# Feature names produced by the fitted vectorizer; the length is displayed below.
feature_names =  vectorizer.get_feature_names_out()
len(feature_names)

22592

In [None]:
# Display the repr of the transformed test matrix (shape and storage format).
X_test_tf

<4486x22592 sparse matrix of type '<class 'numpy.float64'>'
	with 234900 stored elements in Compressed Sparse Row format>

### KNN

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(KNeighborsClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7043580026749888
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      1570
           1       0.20      0.10      0.13       447
           2       0.67      0.76      0.71      2164
           3       0.31      0.13      0.18       305

    accuracy                           0.58      4486
   macro avg       0.42      0.38      0.39      4486
weighted avg       0.54      0.58      0.56      4486



### DT

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.44      0.46      0.45      1570
           1       0.13      0.12      0.12       447
           2       0.64      0.63      0.64      2164
           3       0.18      0.16      0.17       305

    accuracy                           0.49      4486
   macro avg       0.35      0.34      0.34      4486
weighted avg       0.49      0.49      0.49      4486



### Random Forest

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.60      0.58      0.59      1570
           1       0.62      0.01      0.02       447
           2       0.66      0.91      0.77      2164
           3       0.50      0.02      0.03       305

    accuracy                           0.64      4486
   macro avg       0.60      0.38      0.35      4486
weighted avg       0.63      0.64      0.58      4486



### Gradient Boosting

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6851315202853322
              precision    recall  f1-score   support

           0       0.57      0.63      0.60      1570
           1       0.42      0.04      0.07       447
           2       0.70      0.85      0.77      2164
           3       0.48      0.12      0.19       305

    accuracy                           0.64      4486
   macro avg       0.54      0.41      0.41      4486
weighted avg       0.61      0.64      0.60      4486



### XGBoost

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8525969683459652
              precision    recall  f1-score   support

           0       0.56      0.64      0.60      1570
           1       0.39      0.09      0.15       447
           2       0.72      0.83      0.77      2164
           3       0.50      0.20      0.28       305

    accuracy                           0.65      4486
   macro avg       0.54      0.44      0.45      4486
weighted avg       0.62      0.65      0.62      4486



### NB

In [None]:
# Multinomial Naive Bayes evaluated by hand so that the report can be produced
# with zero_division=1 (undefined precision is shown as 1.0 instead of warning).
classifier = MultinomialNB()
classifier.fit(X_train_tf, y_train_tf)

# Predictions on the data the model was fitted on.
y_pred = classifier.predict(X_train_tf)
# Predictions on the held-out test data.
test_y_pred = classifier.predict(X_test_tf)

train_accuracy = accuracy_score(y_train_tf, y_pred)
print(train_accuracy)
test_report = classification_report(y_test_tf, test_y_pred, zero_division=1)
print(test_report)

0.6516941596076683
              precision    recall  f1-score   support

           0       0.60      0.51      0.55      1570
           1       1.00      0.00      0.00       447
           2       0.63      0.92      0.75      2164
           3       1.00      0.00      0.00       305

    accuracy                           0.62      4486
   macro avg       0.81      0.36      0.32      4486
weighted avg       0.68      0.62      0.55      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6516941596076683
              precision    recall  f1-score   support

           0       0.60      0.51      0.55      1570
           1       0.00      0.00      0.00       447
           2       0.63      0.92      0.75      2164
           3       0.00      0.00      0.00       305

    accuracy                           0.62      4486
   macro avg       0.31      0.36      0.32      4486
weighted avg       0.51      0.62      0.55      4486



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### SVM

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
# The test matrix is densified here, presumably because SVC was fitted on the
# dense training array and is then given matching dense input at predict time.
acc_train, cm, cf = training(SVC(kernel='linear'), X_train_tf, X_test_tf.toarray(), y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7467119928666964
              precision    recall  f1-score   support

           0       0.56      0.68      0.61      1570
           1       0.34      0.03      0.05       447
           2       0.73      0.84      0.78      2164
           3       0.48      0.14      0.22       305

    accuracy                           0.65      4486
   macro avg       0.53      0.42      0.42      4486
weighted avg       0.62      0.65      0.61      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8576683013820776
              precision    recall  f1-score   support

           0       0.57      0.61      0.59      1570
           1       0.30      0.11      0.16       447
           2       0.71      0.82      0.76      2164
           3       0.38      0.19      0.25       305

    accuracy                           0.63      4486
   macro avg       0.49      0.43      0.44      4486
weighted avg       0.60      0.63      0.61      4486



## Unigrams and Max features 10.000

In [None]:
# Build TF-IDF features from unigrams, capped at 10000 features. The vocabulary
# is fitted on the training narratives only and reused to transform the test set.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(KNeighborsClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7047481052162282
              precision    recall  f1-score   support

           0       0.51      0.56      0.53      1570
           1       0.22      0.11      0.14       447
           2       0.67      0.74      0.71      2164
           3       0.33      0.13      0.19       305

    accuracy                           0.58      4486
   macro avg       0.43      0.39      0.39      4486
weighted avg       0.54      0.58      0.55      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.46      0.49      0.48      1570
           1       0.16      0.13      0.14       447
           2       0.65      0.65      0.65      2164
           3       0.21      0.20      0.20       305

    accuracy                           0.51      4486
   macro avg       0.37      0.37      0.37      4486
weighted avg       0.50      0.51      0.51      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.58      0.61      0.59      1570
           1       0.67      0.01      0.03       447
           2       0.68      0.88      0.76      2164
           3       0.71      0.02      0.03       305

    accuracy                           0.64      4486
   macro avg       0.66      0.38      0.35      4486
weighted avg       0.64      0.64      0.58      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6852429781542577
              precision    recall  f1-score   support

           0       0.56      0.63      0.60      1570
           1       0.51      0.04      0.08       447
           2       0.70      0.84      0.76      2164
           3       0.53      0.14      0.22       305

    accuracy                           0.64      4486
   macro avg       0.58      0.41      0.41      4486
weighted avg       0.62      0.64      0.60      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8514823896567097
              precision    recall  f1-score   support

           0       0.57      0.64      0.60      1570
           1       0.36      0.09      0.14       447
           2       0.72      0.83      0.77      2164
           3       0.49      0.17      0.25       305

    accuracy                           0.65      4486
   macro avg       0.53      0.43      0.44      4486
weighted avg       0.62      0.65      0.61      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6525858225590727
              precision    recall  f1-score   support

           0       0.59      0.57      0.58      1570
           1       0.00      0.00      0.00       447
           2       0.65      0.90      0.76      2164
           3       0.00      0.00      0.00       305

    accuracy                           0.63      4486
   macro avg       0.31      0.37      0.33      4486
weighted avg       0.52      0.63      0.57      4486



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8260142666072224
              precision    recall  f1-score   support

           0       0.57      0.61      0.59      1570
           1       0.30      0.11      0.16       447
           2       0.71      0.82      0.76      2164
           3       0.37      0.19      0.25       305

    accuracy                           0.63      4486
   macro avg       0.49      0.43      0.44      4486
weighted avg       0.60      0.63      0.61      4486



## Unigrams and Max features 5.000

In [None]:
# Build TF-IDF features from unigrams, capped at 5000 features. The vocabulary
# is fitted on the training narratives only and reused to transform the test set.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(KNeighborsClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7027418635755684
              precision    recall  f1-score   support

           0       0.51      0.57      0.53      1570
           1       0.24      0.11      0.15       447
           2       0.67      0.74      0.70      2164
           3       0.33      0.14      0.19       305

    accuracy                           0.58      4486
   macro avg       0.44      0.39      0.40      4486
weighted avg       0.55      0.58      0.56      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.46      0.50      0.48      1570
           1       0.18      0.16      0.17       447
           2       0.65      0.64      0.64      2164
           3       0.19      0.15      0.17       305

    accuracy                           0.51      4486
   macro avg       0.37      0.36      0.37      4486
weighted avg       0.50      0.51      0.51      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.58      0.62      0.60      1570
           1       0.50      0.01      0.02       447
           2       0.69      0.89      0.77      2164
           3       0.67      0.02      0.04       305

    accuracy                           0.65      4486
   macro avg       0.61      0.38      0.36      4486
weighted avg       0.63      0.65      0.59      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6868591172536781
              precision    recall  f1-score   support

           0       0.56      0.63      0.59      1570
           1       0.50      0.04      0.07       447
           2       0.70      0.84      0.77      2164
           3       0.47      0.11      0.18       305

    accuracy                           0.64      4486
   macro avg       0.56      0.41      0.40      4486
weighted avg       0.62      0.64      0.60      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.854881854658939
              precision    recall  f1-score   support

           0       0.55      0.63      0.59      1570
           1       0.35      0.07      0.12       447
           2       0.72      0.83      0.77      2164
           3       0.44      0.17      0.24       305

    accuracy                           0.64      4486
   macro avg       0.52      0.42      0.43      4486
weighted avg       0.60      0.64      0.60      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6486290682122158
              precision    recall  f1-score   support

           0       0.58      0.59      0.59      1570
           1       0.00      0.00      0.00       447
           2       0.66      0.89      0.76      2164
           3       0.67      0.01      0.01       305

    accuracy                           0.63      4486
   macro avg       0.48      0.37      0.34      4486
weighted avg       0.57      0.63      0.57      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7832144449398127
              precision    recall  f1-score   support

           0       0.57      0.61      0.59      1570
           1       0.28      0.10      0.15       447
           2       0.71      0.81      0.76      2164
           3       0.38      0.20      0.26       305

    accuracy                           0.63      4486
   macro avg       0.49      0.43      0.44      4486
weighted avg       0.59      0.63      0.60      4486



## Unigrams and Bigrams and Max features

In [None]:
# Build TF-IDF features from unigrams and bigrams, capped at 5000 features. The
# vocabulary is fitted on the training narratives only and reused for the test set.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(KNeighborsClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6998439589835043
              precision    recall  f1-score   support

           0       0.51      0.61      0.55      1570
           1       0.24      0.12      0.16       447
           2       0.69      0.73      0.71      2164
           3       0.30      0.09      0.14       305

    accuracy                           0.58      4486
   macro avg       0.43      0.39      0.39      4486
weighted avg       0.55      0.58      0.56      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9990526081141329
              precision    recall  f1-score   support

           0       0.46      0.48      0.47      1570
           1       0.16      0.14      0.15       447
           2       0.65      0.67      0.66      2164
           3       0.21      0.17      0.19       305

    accuracy                           0.51      4486
   macro avg       0.37      0.36      0.37      4486
weighted avg       0.51      0.51      0.51      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9990526081141329
              precision    recall  f1-score   support

           0       0.57      0.62      0.59      1570
           1       0.50      0.01      0.02       447
           2       0.69      0.87      0.77      2164
           3       0.64      0.02      0.04       305

    accuracy                           0.64      4486
   macro avg       0.60      0.38      0.36      4486
weighted avg       0.62      0.64      0.58      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6914288898796255
              precision    recall  f1-score   support

           0       0.57      0.63      0.59      1570
           1       0.42      0.04      0.07       447
           2       0.70      0.85      0.77      2164
           3       0.44      0.12      0.19       305

    accuracy                           0.64      4486
   macro avg       0.53      0.41      0.41      4486
weighted avg       0.61      0.64      0.60      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8622938029424877
              precision    recall  f1-score   support

           0       0.57      0.65      0.61      1570
           1       0.33      0.07      0.11       447
           2       0.72      0.84      0.78      2164
           3       0.45      0.17      0.24       305

    accuracy                           0.65      4486
   macro avg       0.52      0.43      0.44      4486
weighted avg       0.61      0.65      0.62      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.6524186357556844
              precision    recall  f1-score   support

           0       0.57      0.63      0.60      1570
           1       0.33      0.00      0.01       447
           2       0.69      0.87      0.77      2164
           3       0.62      0.03      0.06       305

    accuracy                           0.64      4486
   macro avg       0.55      0.38      0.36      4486
weighted avg       0.61      0.64      0.59      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.802329469460544
              precision    recall  f1-score   support

           0       0.58      0.59      0.59      1570
           1       0.32      0.15      0.20       447
           2       0.71      0.82      0.76      2164
           3       0.37      0.21      0.27       305

    accuracy                           0.63      4486
   macro avg       0.50      0.44      0.46      4486
weighted avg       0.60      0.63      0.61      4486



# Binary Classification and Holdout CV

In [5]:
# Reload the dataset from disk so the binary section starts from the original
# textual reason labels (the multiclass section overwrote df['reason']).
# NOTE: two directory names in this path end with a space; keep the literal byte-exact.
df = pd.read_csv('/content/drive/MyDrive/MS DATA SCIENCE /TESE /data_processed.csv')

In [6]:
# Convert the textual complaint reason into a binary target:
# 'Atraso de entrega' -> 1, every other listed reason -> 0.
reason_dict = {
    'Mau Serviço Prestado': 0,
    'Condições de entrega': 0,
    'Atraso de entrega': 1,
    'Enganos': 0,
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not update `df` under pandas Copy-on-Write. The explicit cast keeps an integer
# dtype and fails fast if an unmapped (non-numeric) label is left in the column.
df['reason'] = df['reason'].replace(reason_dict).astype('int64')

In [7]:
# Sanity check: list the distinct values now stored in the encoded target column.
df['reason'].unique()

array([0, 1])

In [None]:
# Number of rows per class in the binary target.
df['reason'].value_counts()

0    11702
1    10728
Name: reason, dtype: int64

In [8]:
# Features are the narrative column (vectorised with TF-IDF further down);
# the target is the binary-encoded complaint reason.
X, y = df['narrative_tfidf'], df['reason']

In [None]:
# Print the share of each class in the full dataset.
for label, count in Counter(y).items():
  print(label, count/len(y))

0 0.5217119928666963
1 0.4782880071333036


# Train test split

In [9]:
# we should always split before transforming the data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train_tf, y_test_tf = train_test_split(
    X,
    y.values,
    test_size=0.20,
    random_state=200,
)

In [None]:
# Print the share of each class in the training labels.
for label, count in Counter(y_train_tf).items():
  print(label, count/len(y_train_tf))

0 0.5227374052608114
1 0.4772625947391886


In [None]:
# Print the share of each class in the test labels.
for label, count in Counter(y_test_tf).items():
  print(label, count/len(y_test_tf))

0 0.5176103432902363
1 0.4823896567097637


## Unigrams only and max features

In [10]:
# Build TF-IDF features from unigrams, capped at 5000 features. The vocabulary
# is fitted on the training narratives only and reused to transform the test set.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [None]:
# training() returns (train accuracy, classification report, confusion matrix).
# `cm` holds the classification report text; `cf` holds the confusion matrix.
acc_train,cm,cf=training(KNeighborsClassifier(),X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)
print(cf)

In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9992755238519839
              precision    recall  f1-score   support

           0       0.67      0.67      0.67      2322
           1       0.64      0.64      0.64      2164

    accuracy                           0.66      4486
   macro avg       0.66      0.66      0.66      4486
weighted avg       0.66      0.66      0.66      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9992755238519839
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      2322
           1       0.76      0.76      0.76      2164

    accuracy                           0.77      4486
   macro avg       0.77      0.77      0.77      4486
weighted avg       0.77      0.77      0.77      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7796477931341953
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      2322
           1       0.75      0.75      0.75      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.76      0.76      0.76      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8908270173874275
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      2322
           1       0.76      0.76      0.76      2164

    accuracy                           0.77      4486
   macro avg       0.77      0.77      0.77      4486
weighted avg       0.77      0.77      0.77      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7805394560855996
              precision    recall  f1-score   support

           0       0.78      0.73      0.76      2322
           1       0.73      0.78      0.76      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.76      0.76      0.76      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8529870708872046
              precision    recall  f1-score   support

           0       0.77      0.76      0.77      2322
           1       0.75      0.76      0.75      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.76      0.76      0.76      4486



## Unigrams and bigrams and max features

In [10]:
# Build TF-IDF features from unigrams and bigrams, capped at 5000 features. The
# vocabulary is fitted on the training narratives only and reused for the test set.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# The training matrix is converted to a dense array; the test matrix stays sparse.
X_train_tf = vectorizer.fit_transform(X_train).toarray()
X_test_tf = vectorizer.transform(X_test)

In [11]:
# training() returns (train accuracy, classification report, confusion matrix).
# `cm` holds the classification report text; `cf` holds the confusion matrix.
acc_train,cm,cf=training(KNeighborsClassifier(),X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)
print(cf)

acc train:  0.8085153811859117
              precision    recall  f1-score   support

           0       0.73      0.72      0.72      2322
           1       0.70      0.71      0.71      2164

    accuracy                           0.71      4486
   macro avg       0.71      0.71      0.71      4486
weighted avg       0.71      0.71      0.71      4486

[[1668  654]
 [ 629 1535]]


In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(DecisionTreeClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.66      0.67      0.67      2322
           1       0.64      0.63      0.64      2164

    accuracy                           0.65      4486
   macro avg       0.65      0.65      0.65      4486
weighted avg       0.65      0.65      0.65      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(RandomForestClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.9991640659830584
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2322
           1       0.76      0.76      0.76      2164

    accuracy                           0.77      4486
   macro avg       0.77      0.77      0.77      4486
weighted avg       0.77      0.77      0.77      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(GradientBoostingClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7840503789567543
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      2322
           1       0.74      0.75      0.75      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.76      0.76      0.76      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(XGBClassifier(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8931119037004013
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      2322
           1       0.75      0.77      0.76      2164

    accuracy                           0.77      4486
   macro avg       0.77      0.77      0.77      4486
weighted avg       0.77      0.77      0.77      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(MultinomialNB(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.7801493535443602
              precision    recall  f1-score   support

           0       0.79      0.73      0.76      2322
           1       0.73      0.80      0.76      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.77      0.76      0.76      4486



In [None]:
# training() returns (train accuracy, classification report, confusion matrix);
# unpack all three — a two-name unpack raises ValueError. `cm` is the report text.
acc_train, cm, cf = training(LinearSVC(), X_train_tf, X_test_tf, y_train_tf, y_test_tf)
print('acc train: ',acc_train)
print(cm)

acc train:  0.8661948283548818
              precision    recall  f1-score   support

           0       0.78      0.75      0.77      2322
           1       0.74      0.77      0.76      2164

    accuracy                           0.76      4486
   macro avg       0.76      0.76      0.76      4486
weighted avg       0.76      0.76      0.76      4486

