# Dependencies

In [1]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Importing and Cleaning Data

In [2]:
# The file is JSON Lines (one JSON record per line), hence lines=True.
df = pd.read_json("Data_cyb.json", orient="columns", lines=True)

In [3]:
# Each annotation is a dict whose "label" entry is a one-element list holding
# the class as a string (e.g. ['1']); pull it out as an integer.
rating = [int(annotation["label"][0]) for annotation in df["annotation"]]

df["rating"] = rating

In [4]:
# Preview the first rows to confirm the new integer `rating` column.
df.head()

Unnamed: 0,content,annotation,extras,rating
0,Get fucking real dude.,"{'notes': '', 'label': ['1']}",,1
1,She is as dirty as they come and that crook ...,"{'notes': '', 'label': ['1']}",,1
2,why did you fuck it up. I could do it all day...,"{'notes': '', 'label': ['1']}",,1
3,Dude they dont finish enclosing the fucking s...,"{'notes': '', 'label': ['1']}",,1
4,WTF are you talking about Men? No men thats n...,"{'notes': '', 'label': ['1']}",,1


In [5]:
# Load the Twitter comments CSV; the preview below shows `content` and
# `rating` columns, matching the two columns kept from `df`.
tweets = pd.read_csv("Test_Twitter_Comments.csv")
tweets.tail()

Unnamed: 0,content,rating
96,That is someone who does it from their heart. ...,1
97,Absolutely applaud your work to secure freedom...,0
98,You'll never learn it till you actually live i...,1
99,Nothing on the reinstatement of federal Capito...,1
100,Crickets,0


In [6]:
# Keep only the text and its integer label; the raw annotation/extras columns
# are not used for modelling.
new_df1 = df.loc[:, ["content", "rating"]]

In [7]:
# Stack both labelled sources into one frame. ignore_index=True renumbers the
# rows 0..n-1; without it both inputs keep their own 0-based labels, so the
# result has duplicate index values and label-based lookups (.loc) would
# return several rows.
new_df = pd.concat([new_df1, tweets], ignore_index=True)

In [8]:
# Hold out 20% of the comments as the final test set. The seed is fixed so the
# same rows land in the test set on every run; an unseeded split makes the
# final accuracy (and everything tuned on the remaining 80%) irreproducible.
X, X_test, y, y_test = train_test_split(
    new_df["content"], new_df["rating"], train_size=0.8, random_state=42
)


In [9]:
# `re` is already imported in the dependencies cell at the top of the notebook.

# Raw strings keep regex escapes such as \. \d \s from being parsed as
# (invalid) Python string escapes, which triggers a SyntaxWarning on recent
# Python versions. The compiled patterns are unchanged.
# Punctuation and digit runs that are deleted outright.
REPLACE_NO_SPACE = re.compile(r'(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)')
# Double HTML line breaks, hyphens and slashes that are replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "


def preprocess_reviews(reviews):
    """Lower-case and strip punctuation/digits from each text.

    Parameters
    ----------
    reviews : iterable of str
        Raw comment texts.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Characters matched
        by REPLACE_NO_SPACE are removed; matches of REPLACE_WITH_SPACE become
        a single space. Surrounding whitespace is not trimmed.
    """
    return [
        REPLACE_WITH_SPACE.sub(SPACE, REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()))
        for line in reviews
    ]

# X and X_test must still be the raw text Series from the split cell here;
# later cells rebind both names to sparse feature matrices.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(texts) for texts in (X, X_test)
)

# Logistic Regression

## Baseline Logistic Regression

In [10]:
# Baseline features: binary presence/absence of each unigram.
baseline_vectorizer = CountVectorizer(binary=True)
X_baseline = baseline_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X_baseline, y, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.6801790599353394
Accuracy for C=0.05: 0.7284257647351405
Accuracy for C=0.25: 0.7813976622730664
Accuracy for C=0.5: 0.797314100969908
Accuracy for C=1: 0.8157174832131311


### Has room to learn

## Logistic Regression, Removal of Stop Words

In [11]:
from nltk.corpus import stopwords

# NLTK's English stop-word list; read by remove_stop_words below.
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus, stop_words=None):
    """Drop stop words from every document in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents; tokens are obtained with ``str.split()`` (whitespace).
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``english_stop_words``.

    Returns
    -------
    list of str
        One string per document with the remaining tokens joined by a single
        space. A document made up only of stop words becomes ``''``.
    """
    # A set gives O(1) membership tests; the stop-word list would otherwise be
    # scanned linearly for every token of every document.
    stop_set = frozenset(english_stop_words if stop_words is None else stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_set)
        for review in corpus
    ]

In [12]:
# Binary unigram features on the stop-word-filtered training text.
no_stop_words_train = remove_stop_words(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(no_stop_words_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.681173837353892
Accuracy for C=0.05: 0.7212136284506342
Accuracy for C=0.25: 0.7726933598607312
Accuracy for C=0.5: 0.7896045759761253
Accuracy for C=1: 0.8062670977368813


### Still has room to learn

## Logistic Regression, Stemming

In [13]:
def get_stemmed_text(corpus):
    """Return `corpus` with every whitespace-separated token Porter-stemmed.

    Each document is split on whitespace, each token is passed through
    ``PorterStemmer.stem`` and the stems are re-joined with single spaces.
    """
    from nltk.stem.porter import PorterStemmer

    porter = PorterStemmer()
    stemmed_corpus = []
    for review in corpus:
        stems = [porter.stem(token) for token in review.split()]
        stemmed_corpus.append(' '.join(stems))
    return stemmed_corpus

# Binary unigram features on the Porter-stemmed training text.
stemmed_reviews_train = get_stemmed_text(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(stemmed_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.6809251429992539
Accuracy for C=0.05: 0.7169858244217856
Accuracy for C=0.25: 0.7667246953494156
Accuracy for C=0.5: 0.7888584929122109
Accuracy for C=1: 0.8007958219348421


### Still has room to learn

## Logistic Regression, Lemmatization

In [14]:
def get_lemmatized_text(corpus):
    """Return `corpus` with every whitespace-separated token lemmatized.

    Each document is split on whitespace, each token is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments and the
    results are re-joined with single spaces.
    """
    from nltk.stem import WordNetLemmatizer

    wordnet = WordNetLemmatizer()
    lemmatized_corpus = []
    for review in corpus:
        lemmas = [wordnet.lemmatize(token) for token in review.split()]
        lemmatized_corpus.append(' '.join(lemmas))
    return lemmatized_corpus

# Binary unigram features on the lemmatized training text.
lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(lemmatized_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.6774434220343198
Accuracy for C=0.05: 0.7097736881372793
Accuracy for C=0.25: 0.7647351405123104
Accuracy for C=0.5: 0.7871176324297439
Accuracy for C=1: 0.8047749316090524


### Still has room to learn

## Logistic Regression, Word Count

In [15]:
# Word-count features: binary=False keeps per-document term frequencies
# instead of presence/absence flags.
wc_vectorizer = CountVectorizer(binary=False)
X = wc_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.6637652325292216
Accuracy for C=0.05: 0.7047998010445163
Accuracy for C=0.25: 0.7590151703556329
Accuracy for C=0.5: 0.7868689380751057
Accuracy for C=1: 0.8047749316090524


### Still has room to learn

## Logistic Regression, N-grams

### 2-grams

In [16]:
# Binary features over unigrams and bigrams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.6958468042775429
Accuracy for C=0.05: 0.7811489679184282
Accuracy for C=0.25: 0.8492912210892812
Accuracy for C=0.5: 0.8589903009201691
Accuracy for C=1: 0.8619746331758269


### Still has room to learn

### 3-grams

In [17]:
# Binary features over 1- to 3-grams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.7197214623228053
Accuracy for C=0.05: 0.8162148719224074
Accuracy for C=0.25: 0.868689380751057
Accuracy for C=0.5: 0.8766476000994777
Accuracy for C=1: 0.877144988808754


### Still has room to learn

### 4-grams

In [18]:
# Binary features over 1- to 4-grams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    score = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))



Accuracy for C=0.01: 0.7311614026361601
Accuracy for C=0.05: 0.8261626461079333
Accuracy for C=0.25: 0.8629694105943795
Accuracy for C=0.5: 0.8649589654314846
Accuracy for C=1: 0.8681919920417807


### Still has room to learn

# Support Vector Machines (SVM)

In [19]:
from sklearn.svm import LinearSVC

## SVM, Removal of Stop Words

In [20]:
# Binary unigram features on the stop-word-filtered training text.
no_stop_words_train = remove_stop_words(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(no_stop_words_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C for the linear SVM.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.7363839840835613
Accuracy for C=0.05: 0.780651579209152
Accuracy for C=0.25: 0.8162148719224074
Accuracy for C=0.5: 0.82392439691619
Accuracy for C=1: 0.8259139517532952


### Still has room to learn

## SVM, Stemming

In [21]:
# Reuses get_stemmed_text from the "Logistic Regression, Stemming" section
# rather than redefining an identical copy of it in this cell.
stemmed_reviews_train = get_stemmed_text(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(stemmed_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.7368813727928376
Accuracy for C=0.05: 0.7868689380751057
Accuracy for C=0.25: 0.8194478985327033
Accuracy for C=0.5: 0.8231783138522756
Accuracy for C=1: 0.8269087291718478


### Still has room to learn

## SVM, Lemmatization

In [22]:
# Reuses get_lemmatized_text from the "Logistic Regression, Lemmatization"
# section rather than redefining an identical copy of it in this cell.
lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)

cv = CountVectorizer(binary=True)
X = cv.fit_transform(lemmatized_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.7411091768216862
Accuracy for C=0.05: 0.797314100969908
Accuracy for C=0.25: 0.8264113404625715
Accuracy for C=0.5: 0.8298930614275056
Accuracy for C=1: 0.8318826162646108


### Still has room to learn

## SVM, Word Count

In [23]:
# Word-count features: binary=False keeps per-document term frequencies
# instead of presence/absence flags.
wc_vectorizer = CountVectorizer(binary=False)
X = wc_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.7244466550609301
Accuracy for C=0.05: 0.7896045759761253
Accuracy for C=0.25: 0.8224322307883611
Accuracy for C=0.5: 0.8296443670728675
Accuracy for C=1: 0.825665257398657


### Still has room to learn

## SVM, N-Grams

### n = 2

In [24]:
# Binary features over unigrams and bigrams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8174583436955981
Accuracy for C=0.05: 0.8679432976871425
Accuracy for C=0.25: 0.8649589654314846
Accuracy for C=0.5: 0.8632181049490176
Accuracy for C=1: 0.8584929122108929


### Still has room to learn

### n = 3

In [25]:
# Binary features over 1- to 3-grams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8413330017408605
Accuracy for C=0.05: 0.8674459089778662
Accuracy for C=0.25: 0.8599850783387217
Accuracy for C=0.5: 0.8565033573737876
Accuracy for C=1: 0.8510320815717484


### Still has room to learn

### n = 4

In [26]:
# Binary features over 1- to 4-grams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
X = ngram_vectorizer.fit_transform(reviews_train_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8547624968913206
Accuracy for C=0.05: 0.8719224073613528
Accuracy for C=0.25: 0.8634667993036558
Accuracy for C=0.5: 0.8574981347923402
Accuracy for C=1: 0.852026858990301


### Based on the comparison of the various vectorization schemes combined with logistic regression and SVM, 4-gram vectorization with SVM seemed to stabilize at the greatest accuracy. We then checked whether stemming, lemmatization, and/or the removal of stop words would improve the model further.

## SVM, 4-grams, Stemming

In [27]:
# Binary 1- to 4-gram features on the Porter-stemmed training text.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
X = ngram_vectorizer.fit_transform(stemmed_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.862223327530465
Accuracy for C=0.05: 0.882118875901517
Accuracy for C=0.25: 0.8746580452623726
Accuracy for C=0.5: 0.8706789355881621
Accuracy for C=1: 0.8684406863964188


## SVM, 4-grams, Lemmatization

In [28]:
# Binary 1- to 4-gram features on the lemmatized training text.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
X = ngram_vectorizer.fit_transform(lemmatized_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8709276299428003
Accuracy for C=0.05: 0.8776423775180303
Accuracy for C=0.25: 0.870430241233524
Accuracy for C=0.5: 0.8639641880129321
Accuracy for C=1: 0.8584929122108929


## SVM, 4-grams, Stemming, Removal of stop words

In [29]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, passed to CountVectorizer(stop_words=...) in
# the cells below.
stop_words = stopwords.words('english')

In [30]:
# Binary 1- to 4-gram features on the stemmed text, with stop words dropped by
# the vectorizer.
# NOTE(review): the stop-word list is unstemmed while this corpus is stemmed,
# so stop words whose stem differs from the listed form may not be removed -
# confirm this is intended.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=stop_words)
X = ngram_vectorizer.fit_transform(stemmed_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8333747823924397
Accuracy for C=0.05: 0.8567520517284257
Accuracy for C=0.25: 0.8475503606068142
Accuracy for C=0.5: 0.8425764735140512
Accuracy for C=1: 0.8398408356130316


## SVM, 4-grams, Lemmatization, Removal of stop words

In [31]:
# Binary 1- to 4-gram features on the lemmatized text, with stop words dropped
# by the vectorizer.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=stop_words)
X = ngram_vectorizer.fit_transform(lemmatized_reviews_train)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the printed accuracies could not be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))

Accuracy for C=0.01: 0.8209400646605322
Accuracy for C=0.05: 0.8607311614026362
Accuracy for C=0.25: 0.8577468291469784
Accuracy for C=0.5: 0.8550111912459587
Accuracy for C=1: 0.8490425267346431


# Polishing the Model

In [32]:
# Refit the chosen 1- to 4-gram vectorizer on the cleaned training text and
# apply the same vocabulary to the held-out test text.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
X = ngram_vectorizer.fit_transform(reviews_train_clean)
# From here on X_test is the sparse test matrix, no longer the raw text Series.
X_test = ngram_vectorizer.transform(reviews_test_clean)

# Fixed seed: an unseeded split scores each run on a different random
# validation subset, so the C chosen from this sweep could change run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

ccc = []
c_scores = []

# Fine sweep of C over 0.01 .. 0.09. Rounding removes float noise from
# np.arange (e.g. 0.060000000000000005) so the printed labels are clean.
for c in np.round(np.arange(0.01, 0.1, 0.01), 2):
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    # Predict and score once, then reuse the value for both print and plot data.
    score = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, score))
    ccc.append(c)
    c_scores.append(score)

Accuracy for C=0.01: 0.8639641880129321
Accuracy for C=0.02: 0.8734145734891818
Accuracy for C=0.03: 0.8768962944541159
Accuracy for C=0.04: 0.8776423775180303
Accuracy for C=0.05: 0.8778910718726685
Accuracy for C=0.060000000000000005: 0.8798806267097737
Accuracy for C=0.06999999999999999: 0.8778910718726685
Accuracy for C=0.08: 0.8761502113902014
Accuracy for C=0.09: 0.8749067396170107


In [33]:
import matplotlib.pyplot as plt

# Validation accuracy against C for the fine sweep above; labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(ccc, c_scores, marker="o")
ax.set_xlabel("C")
ax.set_ylabel("Validation accuracy")
ax.set_title("LinearSVC, 1- to 4-grams: validation accuracy vs. C")
plt.show()

<Figure size 640x480 with 1 Axes>

# Final Model - SVM, 4-grams, C = 0.05

# Let's test this baby out!

In [34]:
# Refit on every training row (X covers the train and validation rows) with
# the chosen C and a tight tolerance, then score once on the held-out test set.
final = LinearSVC(C=0.05, tol=1e-6)
final.fit(X, y)
test_predictions = final.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(y_test, test_predictions))

Final Accuracy: 0.9112161153941806
