In [2]:
import pandas as pd
df = pd.read_csv('LanguageDetection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
df.isnull().sum()

Text        0
Language    0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

X = df['Text']
y = df['Language']

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.33, random_state=17)

In [76]:
# LINEAR SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC

lin_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
    ('clf', LinearSVC()),
])

lin_clf.fit(XTrain, yTrain)



In [77]:
lin_pred = lin_clf.predict(XTest)

In [125]:
from sklearn import metrics
print(metrics.confusion_matrix(yTest,lin_pred))

[[138   0   0   0   0   0   0   0   0   0   0   0   3   0   0   0   0]
 [  0 128   0   1   2   0   0   0   0   0   0   0   3   0   3   0   0]
 [  0   0 193   1   0   0   0   0   0   0   0   0   8   0   2   0   0]
 [  0   0   0 438   0   0   0   0   0   0   1   0   2   1   0   0   0]
 [  0   0   2   1 324   0   0   0   2   0   0   0   3   2   0   0   0]
 [  0   1   0   2   0 150   0   0   0   0   0   1   5   0   0   0   0]
 [  0   0   0   1   0   0 110   0   0   0   0   0   6   0   0   0   0]
 [  0   0   0   0   0   0   0  26   0   0   0   0   1   0   0   0   0]
 [  0   0   0   1   0   0   0   0 218   0   0   0   6   2   0   0   0]
 [  0   0   0   0   0   0   0   0   0 108   0   0   8   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 213   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 231   3   6   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 241   0   0   0   0]
 [  0   0   0   1   1   0   0   0   2   0   0   1   4 275   0   0   0]
 [  0 

In [126]:
print(metrics.classification_report(yTest,lin_pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       141
      Danish       0.97      0.93      0.95       137
       Dutch       0.99      0.95      0.97       204
     English       0.98      0.99      0.98       442
      French       0.99      0.97      0.98       334
      German       1.00      0.94      0.97       159
       Greek       1.00      0.94      0.97       117
       Hindi       1.00      0.96      0.98        27
     Italian       0.98      0.96      0.97       227
     Kannada       1.00      0.93      0.96       116
   Malayalam       1.00      1.00      1.00       213
  Portugeese       0.99      0.96      0.98       240
     Russian       0.78      1.00      0.87       242
     Spanish       0.96      0.97      0.96       284
    Sweedish       0.98      0.97      0.97       231
       Tamil       1.00      0.98      0.99       150
     Turkish       1.00      0.90      0.95       148

    accuracy              

In [127]:
# Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB


nmb_clf = Pipeline([
    ('cv', CountVectorizer()),
    ('MNB', MultinomialNB()),
])

nmb_clf.fit(XTrain, yTrain)

In [128]:
mnb_pred = nmb_clf.predict(XTest)

In [129]:
from sklearn import metrics
print(metrics.confusion_matrix(yTest,mnb_pred))

[[138   0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 128   0   6   2   0   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0 198   5   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 442   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2 331   0   0   0   0   0   0   0   0   1   0   0   0]
 [  0   0   0   7   0 151   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   5   0   0 112   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0  26   0   0   0   0   0   0   0   0   0]
 [  0   0   0   4   0   0   0   0 222   0   0   0   0   1   0   0   0]
 [  0   0   0   8   0   0   0   0   0 108   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 213   0   0   0   0   0   0]
 [  0   0   0   3   0   0   0   0   0   0   0 232   0   5   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0 234   0   0   0   0]
 [  0   0   0   4   1   0   0   0   0   0   0   0   0 279   0   0   0]
 [  0 

In [130]:
print(metrics.classification_report(yTest,mnb_pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       141
      Danish       1.00      0.93      0.97       137
       Dutch       0.99      0.97      0.98       204
     English       0.85      1.00      0.92       442
      French       0.98      0.99      0.99       334
      German       1.00      0.95      0.97       159
       Greek       1.00      0.96      0.98       117
       Hindi       1.00      0.96      0.98        27
     Italian       1.00      0.98      0.99       227
     Kannada       1.00      0.93      0.96       116
   Malayalam       1.00      1.00      1.00       213
  Portugeese       1.00      0.97      0.98       240
     Russian       1.00      0.97      0.98       242
     Spanish       0.97      0.98      0.98       284
    Sweedish       0.99      0.98      0.99       231
       Tamil       1.00      0.98      0.99       150
     Turkish       1.00      0.89      0.94       148

    accuracy              

In [131]:
# SVC GENERAL
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn import metrics

svc_clf = Pipeline([
    ('cv', TfidfVectorizer(ngram_range=(1,3))),
    ('SVC', SVC(kernel='linear')),
])

svc_clf.fit(XTrain, yTrain)

svc_pred = svc_clf.predict(XTest)

print(metrics.confusion_matrix(yTest,svc_pred))

[[136   0   0   0   0   0   0   0   0   0   0   0   5   0   0   0   0]
 [  0 124   0   1   1   0   0   0   0   0   0   0   7   0   4   0   0]
 [  0   0 189   1   0   0   0   0   0   0   0   0  12   0   2   0   0]
 [  0   0   0 437   0   0   0   0   0   0   0   0   4   1   0   0   0]
 [  0   0   2   0 325   0   0   0   2   0   0   0   3   2   0   0   0]
 [  0   1   1   2   0 147   0   0   0   0   0   0   6   0   2   0   0]
 [  0   0   0   1   0   0 107   0   0   0   0   0   9   0   0   0   0]
 [  0   0   0   0   0   0   0  21   0   0   0   0   5   1   0   0   0]
 [  0   0   0   1   0   0   0   0 217   0   0   0   7   2   0   0   0]
 [  0   0   0   0   0   0   0   0   0 107   0   0   9   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 206   0   7   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0 227   5   7   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 241   0   0   0   0]
 [  0   0   0   1   1   0   0   0   3   0   0   1   6 272   0   0   0]
 [  0 

In [132]:
print(metrics.classification_report(yTest,svc_pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.96      0.98       141
      Danish       0.98      0.91      0.94       137
       Dutch       0.98      0.93      0.95       204
     English       0.98      0.99      0.98       442
      French       0.99      0.97      0.98       334
      German       1.00      0.92      0.96       159
       Greek       1.00      0.91      0.96       117
       Hindi       1.00      0.78      0.88        27
     Italian       0.97      0.96      0.96       227
     Kannada       1.00      0.92      0.96       116
   Malayalam       1.00      0.97      0.98       213
  Portugeese       1.00      0.95      0.97       240
     Russian       0.69      1.00      0.81       242
     Spanish       0.95      0.96      0.96       284
    Sweedish       0.97      0.97      0.97       231
       Tamil       1.00      0.97      0.99       150
     Turkish       1.00      0.87      0.93       148

    accuracy              

In [159]:
# Logisitic regression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

log_clf = Pipeline([
    ('cv', TfidfVectorizer()),
    ('LOG', LogisticRegression(max_iter=1000, C=10000, penalty='l1', solver='saga')),
])

log_clf.fit(XTrain, yTrain)

log_pred = log_clf.predict(XTest)

print(metrics.confusion_matrix(yTest,log_pred))

[[135   0   0   0   0   0   0   0   0   6   0   0   0   0   0   0   0]
 [  0 125   0   1   2   0   0   0   0   7   0   0   0   0   2   0   0]
 [  0   0 189   1   0   0   0   0   0  12   0   0   0   0   2   0   0]
 [  0   0   0 436   0   0   0   0   0   5   1   0   0   0   0   0   0]
 [  0   0   2   0 322   0   0   0   2   7   0   0   0   1   0   0   0]
 [  0   2   0   1   0 150   0   0   0   6   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0 106   0   0  10   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  24   0   3   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0 214  10   0   0   0   2   0   0   0]
 [  0   0   0   0   0   0   0   0   0 116   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   4 209   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   6   0 228   0   5   0   0   0]
 [  0   0   0   1   0   0   0   0   0  12   0   0 229   0   0   0   0]
 [  0   0   0   1   1   0   0   0   5   7   0   1   0 269   0   0   0]
 [  0 



In [160]:
print(metrics.classification_report(yTest,log_pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.96      0.98       141
      Danish       0.95      0.91      0.93       137
       Dutch       0.99      0.93      0.96       204
     English       0.98      0.99      0.98       442
      French       0.99      0.96      0.98       334
      German       1.00      0.94      0.97       159
       Greek       1.00      0.91      0.95       117
       Hindi       1.00      0.89      0.94        27
     Italian       0.96      0.94      0.95       227
     Kannada       0.49      1.00      0.66       116
   Malayalam       1.00      0.98      0.99       213
  Portugeese       1.00      0.95      0.97       240
     Russian       1.00      0.95      0.97       242
     Spanish       0.97      0.95      0.96       284
    Sweedish       0.98      0.96      0.97       231
       Tamil       1.00      0.98      0.99       150
     Turkish       1.00      0.87      0.93       148

    accuracy              

In [137]:
# Forest regression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

forest_clf = Pipeline([
    ('cv', TfidfVectorizer()),
    ('forest', RandomForestClassifier(n_estimators=100)),
])

forest_clf.fit(XTrain, yTrain)

forest_pred = forest_clf.predict(XTest)

print(metrics.confusion_matrix(yTest,forest_pred))

[[127   0   0   1   0   0   0   0   0  13   0   0   0   0   0   0   0]
 [  0 122   1   2   1   0   0   0   0   6   0   0   0   1   4   0   0]
 [  0   1 184   2   0   1   0   0   1  13   0   0   0   1   1   0   0]
 [  0   0   1 431   0   0   0   0   0   7   0   0   0   3   0   0   0]
 [  0   1   2   1 310   1   0   0   4   7   0   1   0   7   0   0   0]
 [  0   4   0   2   1 142   0   0   1   9   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0 105   0   0  11   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  23   0   4   0   0   0   0   0   0   0]
 [  0   0   0   3   0   0   0   0 209  11   0   0   0   4   0   0   0]
 [  0   0   0   0   0   0   0   0   0 116   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  10 203   0   0   0   0   0   0]
 [  0   0   0   2   0   0   0   0   0  10   0 222   0   6   0   0   0]
 [  0   0   0   2   0   0   0   0   0  21   0   0 219   0   0   0   0]
 [  0   0   0   1   2   0   0   0   3  10   0   2   0 266   0   0   0]
 [  0 

In [138]:
print(metrics.classification_report(yTest,forest_pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.90      0.95       141
      Danish       0.91      0.89      0.90       137
       Dutch       0.98      0.90      0.94       204
     English       0.96      0.98      0.97       442
      French       0.98      0.93      0.96       334
      German       0.99      0.89      0.94       159
       Greek       1.00      0.90      0.95       117
       Hindi       1.00      0.85      0.92        27
     Italian       0.95      0.92      0.94       227
     Kannada       0.40      1.00      0.58       116
   Malayalam       1.00      0.95      0.98       213
  Portugeese       0.99      0.93      0.95       240
     Russian       1.00      0.90      0.95       242
     Spanish       0.92      0.94      0.93       284
    Sweedish       0.98      0.94      0.96       231
       Tamil       1.00      0.98      0.99       150
     Turkish       1.00      0.79      0.88       148

    accuracy              

In [59]:
#Using grid search with Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

log_pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3))), #Tfidf better for log regression
    ('classifier', LogisticRegression()),
])

params = {
    'classifier__max_iter': [2000],
    'classifier__C' : [ 80, 90, 100],
    'classifier__penalty' : ['l2'],
    'classifier__solver' : ['saga','liblinear'],
}

grid_search_log = GridSearchCV(log_pipe, params, cv=5, scoring='accuracy')

grid_search_log.fit(X, y)

In [60]:
# Display the best parameters and the corresponding score
print("Best Parameters:", grid_search_log.best_params_)
print("Best Cross-Validation Score:", grid_search_log.best_score_)

# Display detailed results
log_results = pd.DataFrame(grid_search_log.cv_results_)
print(log_results[['params', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']])

Best Parameters: {'classifier__C': 90, 'classifier__max_iter': 2000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best Cross-Validation Score: 0.9592678163533241
                                              params  mean_test_score  \
0  {'classifier__C': 80, 'classifier__max_iter': ...         0.957526   
1  {'classifier__C': 80, 'classifier__max_iter': ...         0.958978   
2  {'classifier__C': 90, 'classifier__max_iter': ...         0.957140   
3  {'classifier__C': 90, 'classifier__max_iter': ...         0.959268   
4  {'classifier__C': 100, 'classifier__max_iter':...         0.956849   
5  {'classifier__C': 100, 'classifier__max_iter':...         0.959268   

   std_test_score  mean_fit_time  mean_score_time  
0        0.027120      18.685222         0.121589  
1        0.026184       4.248198         0.177288  
2        0.027772      20.915849         0.123852  
3        0.026417       4.382110         0.184182  
4        0.028453      22.885290         0.1282

In [61]:
log_results.to_csv("log-output.csv")

In [68]:
#Using grid search with random forest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

forest_pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier()),
])

params = {
    'classifier__n_estimators' : [300, 400],
    'classifier__max_depth' : [None],
    'classifier__min_samples_split': [2, 10, 20],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__bootstrap': [True, False],
}

grid_search_forest = GridSearchCV(forest_pipe, params, cv=5, scoring='accuracy')

grid_search_forest.fit(X, y)

In [69]:
# Display the best parameters and the corresponding score
print("Best Parameters:", grid_search_forest.best_params_)
print("Best Cross-Validation Score:", grid_search_forest.best_score_)

# Display detailed results
forest_results = pd.DataFrame(grid_search_forest.cv_results_)
print(forest_results[['params', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']])

Best Parameters: {'classifier__bootstrap': False, 'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 300}
Best Cross-Validation Score: 0.9429163169227401
                                               params  mean_test_score  \
0   {'classifier__bootstrap': True, 'classifier__m...         0.910023   
1   {'classifier__bootstrap': True, 'classifier__m...         0.909346   
2   {'classifier__bootstrap': True, 'classifier__m...         0.912829   
3   {'classifier__bootstrap': True, 'classifier__m...         0.912152   
4   {'classifier__bootstrap': True, 'classifier__m...         0.916408   
5   {'classifier__bootstrap': True, 'classifier__m...         0.916505   
6   {'classifier__bootstrap': True, 'classifier__m...         0.919020   
7   {'classifier__bootstrap': True, 'classifier__m...         0.917859   
8   {'classifier__bootstrap': True, 'classifier__m...         0.

In [70]:
forest_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__bootstrap,param_classifier__max_depth,param_classifier__max_features,param_classifier__min_samples_leaf,param_classifier__min_samples_split,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
40,15.820436,0.397498,1.325524,0.226395,False,,log2,1,20,300,"{'classifier__bootstrap': False, 'classifier__...",0.975338,0.984526,0.973875,0.894049,0.886792,0.942916,0.043079,1
41,21.288528,0.431175,1.73651,0.338012,False,,log2,1,20,400,"{'classifier__bootstrap': False, 'classifier__...",0.976789,0.984526,0.97194,0.89163,0.88776,0.942529,0.043342,2
38,21.904445,0.523992,1.317692,0.25153,False,,log2,1,10,300,"{'classifier__bootstrap': False, 'classifier__...",0.97824,0.985977,0.972424,0.89163,0.884373,0.942529,0.044787,3
17,15.286694,0.48446,1.624463,0.312376,True,,log2,1,20,400,"{'classifier__bootstrap': True, 'classifier__m...",0.974371,0.985977,0.972908,0.893082,0.884857,0.942239,0.043807,4
39,29.212308,0.870193,1.772798,0.395559,False,,log2,1,10,400,"{'classifier__bootstrap': False, 'classifier__...",0.975822,0.985977,0.972424,0.892114,0.88389,0.942045,0.044427,5
16,11.51167,0.236886,1.222106,0.195415,True,,log2,1,20,300,"{'classifier__bootstrap': True, 'classifier__m...",0.972921,0.98501,0.972908,0.890663,0.880987,0.940498,0.044962,6
37,82.966871,5.189525,1.79106,0.33479,False,,log2,1,2,400,"{'classifier__bootstrap': False, 'classifier__...",0.975338,0.98501,0.969521,0.888728,0.880019,0.939723,0.045546,7
36,62.513372,4.323307,1.384916,0.281392,False,,log2,1,2,300,"{'classifier__bootstrap': False, 'classifier__...",0.974855,0.984043,0.970005,0.889695,0.878568,0.939433,0.045514,8
15,20.885308,0.651324,1.630868,0.287202,True,,log2,1,10,400,"{'classifier__bootstrap': True, 'classifier__m...",0.971954,0.981625,0.96807,0.880019,0.881471,0.936628,0.045843,9
14,15.732665,0.2201,1.248152,0.225461,True,,log2,1,10,300,"{'classifier__bootstrap': True, 'classifier__m...",0.972437,0.982108,0.968553,0.880503,0.879052,0.936531,0.046551,10


In [72]:
forest_results.to_csv("forest-output.csv")

In [17]:
#Using grid search with Linear SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

lin_pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()), #TFIDF Count Vectorizer better for Linear SVC
    ('classifier', LinearSVC()),
])

params = {
    'classifier__C' : [0.01, 0.05, .1, 0.5, 1],
    'classifier__loss': ['hinge', 'squared_hinge'],
    'classifier__penalty': ['l2'],
    'classifier__dual': [False, True],
    'classifier__max_iter': [1000, 5000]
}

grid_search_linear = GridSearchCV(lin_pipe, params, cv=5, scoring='accuracy')

grid_search_linear.fit(X, y)

50 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aaron\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aaron\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aaron\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\aaron\anaconda3\Lib\site-packages\sklearn\base.py", l

In [18]:
# Display the best parameters and the corresponding score
print("Best Parameters:", grid_search_linear.best_params_)
print("Best Cross-Validation Score:", grid_search_linear.best_score_)

# Display detailed results
linear_results = pd.DataFrame(grid_search_linear.cv_results_)
print(linear_results[['params', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']])

Best Parameters: {'classifier__C': 0.5, 'classifier__dual': True, 'classifier__loss': 'hinge', 'classifier__max_iter': 1000, 'classifier__penalty': 'l2'}
Best Cross-Validation Score: 0.9637183370623756
                                               params  mean_test_score  \
0   {'classifier__C': 0.01, 'classifier__dual': Fa...              NaN   
1   {'classifier__C': 0.01, 'classifier__dual': Fa...              NaN   
2   {'classifier__C': 0.01, 'classifier__dual': Fa...         0.915830   
3   {'classifier__C': 0.01, 'classifier__dual': Fa...         0.915830   
4   {'classifier__C': 0.01, 'classifier__dual': Tr...         0.952690   
5   {'classifier__C': 0.01, 'classifier__dual': Tr...         0.952690   
6   {'classifier__C': 0.01, 'classifier__dual': Tr...         0.915830   
7   {'classifier__C': 0.01, 'classifier__dual': Tr...         0.915830   
8   {'classifier__C': 0.05, 'classifier__dual': Fa...              NaN   
9   {'classifier__C': 0.05, 'classifier__dual': Fa...     

In [20]:
linear_results.to_csv("lin-output.csv")

In [35]:
#Using grid search with Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

nb_pipe = Pipeline([
    ('vectorizer', CountVectorizer()), #Count Vectorizer better for MultinomialNB
    ('classifier', MultinomialNB()),
])

params = {
    'classifier__alpha' : [0.01, 0.03, 0.08, 0.1, 0.15, 0.25, 0.35, 0.5],
    'classifier__fit_prior': [False, True],
}

grid_search_nb = GridSearchCV(nb_pipe, params, cv=5, scoring='accuracy')

grid_search_nb.fit(X, y)

In [36]:
# Display the best parameters and the corresponding score
print("Best Parameters:", grid_search_nb.best_params_)
print("Best Cross-Validation Score:", grid_search_nb.best_score_)

# Display detailed results
nb_results = pd.DataFrame(grid_search_nb.cv_results_)
print(nb_results[['params', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']])

Best Parameters: {'classifier__alpha': 0.1, 'classifier__fit_prior': False}
Best Cross-Validation Score: 0.9770702734974112
                                               params  mean_test_score  \
0   {'classifier__alpha': 0.01, 'classifier__fit_p...         0.976006   
1   {'classifier__alpha': 0.01, 'classifier__fit_p...         0.975813   
2   {'classifier__alpha': 0.03, 'classifier__fit_p...         0.976490   
3   {'classifier__alpha': 0.03, 'classifier__fit_p...         0.976006   
4   {'classifier__alpha': 0.08, 'classifier__fit_p...         0.977070   
5   {'classifier__alpha': 0.08, 'classifier__fit_p...         0.976103   
6   {'classifier__alpha': 0.1, 'classifier__fit_pr...         0.977070   
7   {'classifier__alpha': 0.1, 'classifier__fit_pr...         0.976006   
8   {'classifier__alpha': 0.15, 'classifier__fit_p...         0.976877   
9   {'classifier__alpha': 0.15, 'classifier__fit_p...         0.975716   
10  {'classifier__alpha': 0.25, 'classifier__fit_p...         

In [38]:
nb_results.to_csv("mnb-output.csv")

In [32]:
#Using grid search with Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

bnb_pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', BernoulliNB()),
])

params = {
    'classifier__alpha' : [0.01, 0.1, 0.5, 1, 10],
    'classifier__fit_prior': [False, True],
    'classifier__binarize': [0.0, 0.1, 0.5, None]
}

grid_search_bnb = GridSearchCV(bnb_pipe, params, cv=5, scoring='accuracy')

grid_search_bnb.fit(X, y)

  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  neg_prob = np.log(1 - np.exp(self.feature_log_

In [33]:
# Display the best parameters and the corresponding score
print("Best Parameters:", grid_search_bnb.best_params_)
print("Best Cross-Validation Score:", grid_search_bnb.best_score_)

# Display detailed results
bnb_results = pd.DataFrame(grid_search_bnb.cv_results_)
print(bnb_results[['params', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']])

Best Parameters: {'classifier__alpha': 0.01, 'classifier__binarize': 0.0, 'classifier__fit_prior': True}
Best Cross-Validation Score: 0.9547194141333041
                                               params  mean_test_score  \
0   {'classifier__alpha': 0.01, 'classifier__binar...         0.952107   
1   {'classifier__alpha': 0.01, 'classifier__binar...         0.954719   
2   {'classifier__alpha': 0.01, 'classifier__binar...         0.952107   
3   {'classifier__alpha': 0.01, 'classifier__binar...         0.954719   
4   {'classifier__alpha': 0.01, 'classifier__binar...         0.952107   
5   {'classifier__alpha': 0.01, 'classifier__binar...         0.954719   
6   {'classifier__alpha': 0.01, 'classifier__binar...         0.133985   
7   {'classifier__alpha': 0.01, 'classifier__binar...         0.133985   
8   {'classifier__alpha': 0.1, 'classifier__binari...         0.944173   
9   {'classifier__alpha': 0.1, 'classifier__binari...         0.944754   
10  {'classifier__alpha': 0.1, 'c

In [34]:
bnb_results.to_csv("bnb-output.csv")

In [6]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
from joblib import dump

linPipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', CalibratedClassifierCV(LinearSVC(C=0.5,dual=True,loss='hinge',max_iter=1000,penalty='l2')))
])
                   
skf = StratifiedKFold(n_splits=5)

scores = cross_val_score(linPipe, X, y, cv=skf, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

linPipe.fit(X, y)

dump(linPipe, 'lin_model2.pkl')

Cross-validation accuracy scores: [0.98210832 0.98742747 0.97822932 0.93226899 0.92597968]
Mean accuracy: 0.9612027541573909


['lin_model2.pkl']

In [84]:
from sklearn.naive_bayes import MultinomialNB

mnbPipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB(alpha=0.1,fit_prior=False))
])

skf = StratifiedKFold(n_splits=5)

scores = cross_val_score(mnbPipe, X, y, cv=skf, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

mnbPipe.fit(X, y)

dump(mnbPipe, 'mnb_model.pkl')

Cross-validation accuracy scores: [0.98742747 0.99129594 0.98548621 0.96274794 0.95839381]
Mean accuracy: 0.9770702734974112


['mnb_model.pkl']

In [85]:
from sklearn.naive_bayes import BernoulliNB

bnbPipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', BernoulliNB(alpha=0.01,binarize=0,fit_prior=True))
])

skf = StratifiedKFold(n_splits=5)

scores = cross_val_score(bnbPipe, X, y, cv=skf, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

bnbPipe.fit(X, y)

dump(bnbPipe, 'bnb_model.pkl')

Cross-validation accuracy scores: [0.98500967 0.98984526 0.98258345 0.92114175 0.89501693]
Mean accuracy: 0.9547194141333041


['bnb_model.pkl']

In [86]:
from sklearn.linear_model import LogisticRegression

logPipe = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3))),
    ('classifier', LogisticRegression(C=90,max_iter=2000,penalty='l2',solver='liblinear'))
])

skf = StratifiedKFold(n_splits=5)

scores = cross_val_score(logPipe, X, y, cv=skf, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

logPipe.fit(X, y)

dump(logPipe, 'log_model.pkl')

Cross-validation accuracy scores: [0.9787234  0.98839458 0.97435897 0.92936623 0.92549589]
Mean accuracy: 0.9592678163533241


['log_model.pkl']

In [87]:
from sklearn.ensemble import RandomForestClassifier

frstPipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(bootstrap=False, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=20, n_estimators=300))
])

skf = StratifiedKFold(n_splits=5)

scores = cross_val_score(frstPipe, X, y, cv=skf, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

frstPipe.fit(X, y)

dump(frstPipe, 'randforest_model.pkl')

Cross-validation accuracy scores: [0.97630561 0.98452611 0.97387518 0.89404935 0.88630866]
Mean accuracy: 0.9430129819330944


['randforest_model.pkl']