In [7]:
import DataPrep
import FeatureSelection
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [8]:
#string to test
doc_new = ['obama is running for president in 2016']

In [9]:
#the feature selection has been done in FeatureSelection.py module. here we will create models using those features for prediction

#first we will use bag of words techniques


In [27]:
#building classifier using naive bayes 
nb_pipeline = Pipeline([
        ('NBCV',FeatureSelection.countV),
        ('nb_clf',MultinomialNB())])

nb_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_nb = nb_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_nb == DataPrep.test_news['Label'])

0.6072128577028616

In [12]:
#building classifier using logistic regression
logR_pipeline = Pipeline([
        ('LogRCV',FeatureSelection.countV),
        ('LogR_clf',LogisticRegression())
        ])
logR_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR = logR_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_LogR == DataPrep.test_news['Label'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6013328106624853

In [13]:
#building Linear SVM classfier
svm_pipeline = Pipeline([
        ('svmCV',FeatureSelection.countV),
        ('svm_clf',svm.LinearSVC())
        ])

svm_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_svm = svm_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_svm == DataPrep.test_news['Label'])



0.5723245785966288

In [15]:
#using SVM Stochastic Gradient Descent on hinge loss
sgd_pipeline = Pipeline([
        ('svm2CV',FeatureSelection.countV),
        ('svm2_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5))
        ])

sgd_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_sgd = sgd_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_sgd == DataPrep.test_news['Label'])



0.6087808702469619

In [16]:
#random forest
random_forest = Pipeline([
        ('rfCV',FeatureSelection.countV),
        ('rf_clf',RandomForestClassifier(n_estimators=200,n_jobs=3))
        ])
    
random_forest.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_rf = random_forest.predict(DataPrep.test_news['Statement'])
np.mean(predicted_rf == DataPrep.test_news['Label'])

0.6256370050960408

In [17]:
#User defined functon for K-Fold cross validatoin
def build_confusion_matrix(classifier):
    
    k_fold = KFold(n_splits=5)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for train_ind, test_ind in k_fold.split(DataPrep.train_news):
        train_text = DataPrep.train_news.iloc[train_ind]['Statement'] 
        train_y = DataPrep.train_news.iloc[train_ind]['Label']
    
        test_text = DataPrep.train_news.iloc[test_ind]['Statement']
        test_y = DataPrep.train_news.iloc[test_ind]['Label']
        
        classifier.fit(train_text,train_y)
        predictions = classifier.predict(test_text)
        
        confusion += confusion_matrix(test_y,predictions)
        score = f1_score(test_y,predictions)
        scores.append(score)
    
    return (print('Total statements classified:', len(DataPrep.train_news)),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

In [20]:
#K-fold cross validation for all classifiers
build_confusion_matrix(nb_pipeline)

Total statements classified: 10240
Score: 0.66961153965076
score length 5
Confusion matrix:
[[2118 2370]
 [1664 4088]]


(None, None, None, None, None)

In [22]:
build_confusion_matrix(logR_pipeline)

Total statements classified: 10240
Score: 0.6470221003039508
score length 5
Confusion matrix:
[[2252 2236]
 [1932 3820]]


(None, None, None, None, None)

In [23]:
build_confusion_matrix(svm_pipeline)




Total statements classified: 10240
Score: 0.6104687487924284
score length 5
Confusion matrix:
[[2260 2228]
 [2246 3506]]


(None, None, None, None, None)

In [24]:
build_confusion_matrix(sgd_pipeline)



Total statements classified: 10240
Score: 0.6563223294384912
score length 5
Confusion matrix:
[[2115 2373]
 [1781 3971]]




(None, None, None, None, None)

In [25]:
build_confusion_matrix(random_forest)

Total statements classified: 10240
Score: 0.700337510384436
score length 5
Confusion matrix:
[[1808 2680]
 [1208 4544]]


(None, None, None, None, None)

In [30]:

"""So far we have used bag of words technique to extract the features and passed those featuers into classifiers. We have also seen the
f1 scores of these classifiers. now lets enhance these features using term frequency weights with various n-grams
"""

##Now using n-grams
#naive-bayes classifier
nb_pipeline_ngram = Pipeline([
        ('nb_tfidf',FeatureSelection.tfidf_ngram),
        ('nb_clf',MultinomialNB())])

nb_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_nb_ngram = nb_pipeline_ngram.predict(DataPrep.test_news['Statement'])
print(np.mean(predicted_nb_ngram == DataPrep.test_news['Label']))


#logistic regression classifier
logR_pipeline_ngram = Pipeline([
        ('LogR_tfidf',FeatureSelection.tfidf_ngram),
        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        ])

logR_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR_ngram = logR_pipeline_ngram.predict(DataPrep.test_news['Statement'])
print(np.mean(predicted_LogR_ngram == DataPrep.test_news['Label']))


#linear SVM classifier
svm_pipeline_ngram = Pipeline([
        ('svm_tfidf',FeatureSelection.tfidf_ngram),
        ('svm_clf',svm.LinearSVC())
        ])

svm_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_svm_ngram = svm_pipeline_ngram.predict(DataPrep.test_news['Statement'])
print(np.mean(predicted_svm_ngram == DataPrep.test_news['Label']))


#sgd classifier
sgd_pipeline_ngram = Pipeline([
         ('sgd_tfidf',FeatureSelection.tfidf_ngram),
         ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5))
         ])

sgd_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_sgd_ngram = sgd_pipeline_ngram.predict(DataPrep.test_news['Statement'])
print(np.mean(predicted_sgd_ngram == DataPrep.test_news['Label']))


#random forest classifier
# random_forest_ngram = Pipeline([
#         ('rf_tfidf',FeatureSelection.tfidf_ngram),
#         ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3))
#         ])
    
# random_forest_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
# predicted_rf_ngram = random_forest_ngram.predict(DataPrep.test_news['Statement'])
# print(np.mean(predicted_rf_ngram == DataPrep.test_news['Label']))


0.5938847510780086
0.6185809486475892




0.6170129361034888
0.5417483339866719




In [32]:
build_confusion_matrix(nb_pipeline_ngram)

Total statements classified: 10240
Score: 0.7224053159841455
score length 5
Confusion matrix:
[[ 758 3730]
 [ 390 5362]]


(None, None, None, None, None)

In [33]:
build_confusion_matrix(logR_pipeline_ngram)

Total statements classified: 10240
Score: 0.7042876638233403
score length 5
Confusion matrix:
[[1581 2907]
 [1045 4707]]


(None, None, None, None, None)

In [34]:
build_confusion_matrix(svm_pipeline_ngram)



Total statements classified: 10240
Score: 0.6790920142902143
score length 5
Confusion matrix:
[[2016 2472]
 [1524 4228]]


(None, None, None, None, None)

In [35]:
build_confusion_matrix(sgd_pipeline_ngram)



Total statements classified: 10240
Score: 0.7190643331130575
score length 5
Confusion matrix:
[[   5 4483]
 [   6 5746]]




(None, None, None, None, None)

In [36]:
# build_confusion_matrix(random_forest_ngram)

In [40]:
print(classification_report(DataPrep.test_news['Label'], predicted_nb_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_LogR_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_svm_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_sgd_ngram))
# print(classification_report(DataPrep.test_news['Label'], predicted_rf_ngram))

              precision    recall  f1-score   support

       False       0.72      0.19      0.30      1169
        True       0.58      0.94      0.71      1382

    accuracy                           0.59      2551
   macro avg       0.65      0.56      0.51      2551
weighted avg       0.64      0.59      0.52      2551

              precision    recall  f1-score   support

       False       0.64      0.39      0.48      1169
        True       0.61      0.81      0.70      1382

    accuracy                           0.62      2551
   macro avg       0.62      0.60      0.59      2551
weighted avg       0.62      0.62      0.60      2551

              precision    recall  f1-score   support

       False       0.61      0.47      0.53      1169
        True       0.62      0.74      0.68      1382

    accuracy                           0.62      2551
   macro avg       0.61      0.61      0.60      2551
weighted avg       0.62      0.62      0.61      2551

              preci

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:

"""
Out of all the models fitted, we would take 2 best performing model. we would call them candidate models
from the confusion matrix, we can see that random forest and logistic regression are best performing 
in terms of precision and recall (take a look into false positive and true negative counts which appeares
to be low compared to rest of the models)
"""
DataPrep.test_news['Label'].shape


(2551,)

In [44]:

# #grid-search parameter optimization
# #random forest classifier parameters
# parameters = {'rf_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
#                'rf_tfidf__use_idf': (True, False),
#                'rf_clf__max_depth': (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
# }

# gs_clf = GridSearchCV(random_forest_ngram, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])

# gs_clf.best_score_
# gs_clf.best_params_
# gs_clf.cv_results_

In [46]:
#logistic regression parameters
parameters = {'LogR_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
               'LogR_tfidf__use_idf': (True, False),
               'LogR_tfidf__smooth_idf': (True, False)
}

gs_clf = GridSearchCV(logR_pipeline_ngram, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])

gs_clf.best_score_
gs_clf.best_params_
gs_clf.cv_results_

{'mean_fit_time': array([0.51498618, 0.50580206, 0.55451908, 0.48259206, 1.49758472,
        1.91337743, 1.94042358, 1.81324525, 2.67262216, 2.68769445,
        2.13458819, 2.7334024 , 3.70294614, 3.68034873, 2.99725838,
        3.30808096, 3.8788918 , 4.4731307 , 3.627105  , 3.96107297]),
 'std_fit_time': array([0.07249428, 0.02747269, 0.02912707, 0.02610611, 0.04418004,
        0.17885938, 0.14435783, 0.26637263, 0.22289564, 0.16804268,
        0.14615459, 0.22147051, 0.45260204, 0.18330022, 0.12727933,
        0.20124522, 0.3694826 , 0.48983952, 0.11738111, 0.39265556]),
 'mean_score_time': array([0.09186759, 0.08498287, 0.09085588, 0.08237467, 0.15127215,
        0.20047255, 0.17806821, 0.13675938, 0.17253098, 0.17034907,
        0.17408266, 0.17356949, 0.29385829, 0.19976573, 0.19039807,
        0.18326821, 0.23344555, 0.20095205, 0.20960693, 0.18896599]),
 'std_score_time': array([0.01451422, 0.01699036, 0.00718271, 0.01012486, 0.02639695,
        0.03359004, 0.04411491, 0.015641

In [47]:
#Linear SVM 
parameters = {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
               'svm_tfidf__use_idf': (True, False),
               'svm_tfidf__smooth_idf': (True, False),
               'svm_clf__penalty': ('l1','l2'),
}

gs_clf = GridSearchCV(svm_pipeline_ngram, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])

gs_clf.best_score_
gs_clf.best_params_
gs_clf.cv_results_

100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hayan\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hayan\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Hayan\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\

{'mean_fit_time': array([0.39515166, 0.53613505, 0.46541924, 0.30848932, 0.7932415 ,
        0.8403564 , 0.79507561, 0.78921242, 1.25976148, 1.45533242,
        1.38110485, 1.28953753, 1.94331169, 1.67899246, 1.77556677,
        1.84174004, 2.120821  , 2.04415259, 2.11058421, 2.02098465,
        0.38659768, 0.43227129, 0.41284575, 0.50077963, 1.01884933,
        0.96103616, 1.06247225, 1.07158747, 1.94453149, 2.31135535,
        2.7620647 , 2.63665433, 3.08317952, 2.99544549, 2.72082272,
        2.23410115, 2.79273458, 2.59300909, 2.67454052, 2.60576797]),
 'std_fit_time': array([0.03869222, 0.10698775, 0.19282231, 0.02089596, 0.05000303,
        0.03306166, 0.05935648, 0.01746673, 0.05675825, 0.05151279,
        0.05714498, 0.02308713, 0.09751565, 0.02697147, 0.05220161,
        0.03591047, 0.02256627, 0.04026503, 0.04118468, 0.03327305,
        0.01463497, 0.04297378, 0.02414127, 0.07709986, 0.05816285,
        0.04953142, 0.05589658, 0.06677874, 0.13224402, 0.33295585,
        0.288

In [48]:
# #by running above commands we can find the model with best performing parameters


# #running both random forest and logistic regression models again with best parameter found with GridSearch method
# random_forest_final = Pipeline([
#         ('rf_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)),
#         ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3,max_depth=10))
#         ])
    
# random_forest_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
# predicted_rf_final = random_forest_final.predict(DataPrep.test_news['Statement'])
# np.mean(predicted_rf_final == DataPrep.test_news['Label'])
# print(metrics.classification_report(DataPrep.test_news['Label'], predicted_rf_final))

In [50]:
import sklearn.metrics as metrics

logR_pipeline_final = Pipeline([
        #('LogRCV',countV_ngram),
        ('LogR_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)),
        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        ])

logR_pipeline_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR_final = logR_pipeline_final.predict(DataPrep.test_news['Statement'])
np.mean(predicted_LogR_final == DataPrep.test_news['Label'])
#accuracy = 0.62
print(metrics.classification_report(DataPrep.test_news['Label'], predicted_LogR_final))

              precision    recall  f1-score   support

       False       0.65      0.38      0.48      1169
        True       0.61      0.82      0.70      1382

    accuracy                           0.62      2551
   macro avg       0.63      0.60      0.59      2551
weighted avg       0.63      0.62      0.60      2551



In [55]:
"""
by running both random forest and logistic regression with GridSearch's best parameter estimation, we found that for random 
forest model with n-gram has better accuracty than with the parameter estimated. The logistic regression model with best parameter 
has almost similar performance as n-gram model so logistic regression will be out choice of model for prediction.
"""

#saving best model to the disk
model_file = 'final_model.sav'
pickle.dump(logR_pipeline_ngram,open(model_file,'wb'))

In [53]:
#Plotting learing curve
def plot_learing_curve(pipeline,title):
    size = 10000
    cv = KFold(size, shuffle=True)
    
    X = DataPrep.train_news["Statement"]
    y = DataPrep.train_news["Label"]
    
    pl = pipeline
    pl.fit(X,y)
    
    train_sizes, train_scores, test_scores = learning_curve(pl, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0)
       
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
     
    plt.figure()
    plt.title(title)
    plt.legend(loc="best")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.gca().invert_yaxis()
    
    # box-like grid
    plt.grid()
    
    # plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    
    # plot the average training and test score lines at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
    # sizes the window for readability and displays the plot
    # shows error from 0 to 1.1
    plt.ylim(-.1,1.1)
    plt.show()


In [None]:
plot_learing_curve(logR_pipeline_ngram,"Naive-bayes Classifier")

In [None]:
plot_learing_curve(nb_pipeline_ngram,"LogisticRegression Classifier")

In [None]:
plot_learing_curve(svm_pipeline_ngram,"SVM Classifier")

In [None]:
plot_learing_curve(sgd_pipeline_ngram,"SGD Classifier")

In [None]:
# plot_learing_curve(random_forest_ngram,"RandomForest Classifier")