In [1]:
import pandas as pd
import os
import csv
import numpy as np

## sklearn
import sklearn # machine learning
from sklearn.feature_extraction.text import CountVectorizer # frequency counts matrix
from sklearn.model_selection import train_test_split # splitting up data
from sklearn import metrics # for accuracy/ precision
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier 

In [2]:
df = pd.read_csv(r'C:/Users/student/Documents/dataset/labeledCom.csv') # read in the data

In [3]:
moddf=df.fillna(0)

In [4]:
moddf.isnull().sum()

label      0
comment    0
dtype: int64

In [5]:
moddf.to_csv(r'C:/Users/student/Documents/new2.csv',index=False)

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                        moddf["comment"], moddf["label"], 
                                        test_size=0.25, 
                                       random_state=42)

In [7]:
type(X_test)

pandas.core.series.Series

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer


In [9]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

def stem(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [10]:
x_train_counts = vectorizer.fit_transform(X_train)
x_test_counts = vectorizer.transform(X_test)

In [11]:
type(x_train_counts)

scipy.sparse.csr.csr_matrix

In [12]:
type(x_test_counts)

scipy.sparse.csr.csr_matrix

In [13]:
mnb = MultinomialNB()
mnb.fit(x_train_counts, Y_train) # fit the model on the training data word counts and training data lables

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
predicted_labels = mnb.predict(x_test_counts) # make our y predictions (labels) on the comment test data

In [15]:
for i in predicted_labels[:10]:
    print (i)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [16]:
tfidf_transformer = TfidfTransformer()
x_tfidf_tr = tfidf_transformer.fit_transform(x_train_counts)
x_tfidf_tst = tfidf_transformer.transform(x_test_counts)
mnb2 = MultinomialNB()
mnb2.fit(x_tfidf_tr, Y_train)
tfidf_pred = mnb2.predict(x_tfidf_tst)
tfidf_acc = metrics.accuracy_score(Y_test, tfidf_pred)
print('We obtained ', round(tfidf_acc, 6), '% accuracy for the tf-idf transformed model')

We obtained  0.721251 % accuracy for the tf-idf transformed model


In [17]:
acc = metrics.accuracy_score(Y_test, predicted_labels)

In [18]:
print('We obtained ', round(acc, 4), '% accuracy for the model!')

We obtained  0.7139 % accuracy for the model!


In [19]:
print('Here is the Classification Report: \n')
print(metrics.classification_report(Y_test, predicted_labels))

Here is the Classification Report: 

              precision    recall  f1-score   support

        -1.0       0.67      0.03      0.06       123
         0.0       0.72      0.98      0.83       784
         1.0       0.21      0.02      0.04       180

   micro avg       0.71      0.71      0.71      1087
   macro avg       0.53      0.34      0.31      1087
weighted avg       0.63      0.71      0.61      1087



In [20]:
print('Here is the Confusion Matrix: \n')
metrics.confusion_matrix(Y_test, predicted_labels)

Here is the Confusion Matrix: 



array([[  4, 118,   1],
       [  2, 768,  14],
       [  0, 176,   4]], dtype=int64)

In [21]:
from sklearn.pipeline import Pipeline
from sklearn import svm

In [22]:
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.SVC())
                   ])
svm_clf.fit(X_train, Y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [23]:
svm_predicted = svm_clf.predict(X_test)
np.mean(svm_predicted == Y_test)

0.7212511499540019

In [24]:
SGD_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, random_state=42,
                                          max_iter=5, tol=None))])

SGD_clf.fit(X_train, Y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [25]:
SGD_predicted = SGD_clf.predict(X_test)
np.mean(SGD_predicted == Y_test)

0.7212511499540019

In [26]:
sgd = SGDClassifier(loss='log', penalty='l2', alpha=1e-3, max_iter=5, tol=None, random_state=1) 
sgd.fit(x_train_counts, Y_train)
sgd_predict = sgd.predict(x_test_counts)
sgd_acc = metrics.accuracy_score(Y_test, sgd_predict)
print('We obtained ', round(sgd_acc, 6), '% accuracy for the logistic regression model')

We obtained  0.715731 % accuracy for the logistic regression model
