# Dependencies

In [1]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Importing and Cleaning Data

In [2]:
df = pd.read_json("Data_cyb.json", lines = True, orient = "columns")

In [3]:
rating = []

for i in df["annotation"]:
    rating.append(int(i["label"][0]))
    
df["rating"] = rating

In [4]:
df.head()

Unnamed: 0,content,annotation,extras,rating
0,Get fucking real dude.,"{'notes': '', 'label': ['1']}",,1
1,She is as dirty as they come and that crook ...,"{'notes': '', 'label': ['1']}",,1
2,why did you fuck it up. I could do it all day...,"{'notes': '', 'label': ['1']}",,1
3,Dude they dont finish enclosing the fucking s...,"{'notes': '', 'label': ['1']}",,1
4,WTF are you talking about Men? No men thats n...,"{'notes': '', 'label': ['1']}",,1


In [5]:
tweets = pd.read_csv("Test_Twitter_Comments.csv")
tweets.tail()

Unnamed: 0,content,rating
96,That is someone who does it from their heart. ...,1
97,Absolutely applaud your work to secure freedom...,0
98,You'll never learn it till you actually live i...,1
99,Nothing on the reinstatement of federal Capito...,1
100,Crickets,0


In [6]:
new_df1 = df[["content", "rating"]]

In [7]:
new_df = pd.concat([new_df1,tweets])

In [8]:
X, X_test, y, y_test = train_test_split(new_df["content"], new_df["rating"], train_size = 0.8)


In [9]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(X)
reviews_test_clean = preprocess_reviews(X_test)

# Baseline Model

Logistic regression model with just the removal of negligible symbols.

In [10]:
baseline_vectorizer = CountVectorizer(binary=True)
baseline_vectorizer.fit(reviews_train_clean)
X_baseline = baseline_vectorizer.transform(reviews_train_clean)
X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X_baseline, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.6856503357373788
Accuracy for C=0.05: 0.734145734891818
Accuracy for C=0.25: 0.7878637154936583
Accuracy for C=0.5: 0.8030340711265854
Accuracy for C=1: 0.8211887590151704


### Has room to learn

# Remove Stop Words

In [11]:
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

In [12]:
no_stop_words_train = remove_stop_words(reviews_train_clean)
no_stop_words_test = remove_stop_words(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(no_stop_words_train)
X = cv.transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.6647600099477742
Accuracy for C=0.05: 0.7222084058691868
Accuracy for C=0.25: 0.7774185525988561
Accuracy for C=0.5: 0.7978114896791842
Accuracy for C=1: 0.8137279283760258


### Still has room to learn

# Normalization

# Stemming

In [13]:
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews_train = get_stemmed_text(reviews_train_clean)
stemmed_reviews_test = get_stemmed_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(stemmed_reviews_train)
X = cv.transform(stemmed_reviews_train)
X_test = cv.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.6836607809002736
Accuracy for C=0.05: 0.7289231534444168
Accuracy for C=0.25: 0.7709524993782642
Accuracy for C=0.5: 0.7888584929122109
Accuracy for C=1: 0.8027853767719473


### Still has room to learn

# Lemmatization

In [14]:
def get_lemmatized_text(corpus):
    
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(lemmatized_reviews_train)
X = cv.transform(lemmatized_reviews_train)
X_test = cv.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.6781895050982343
Accuracy for C=0.05: 0.725938821188759
Accuracy for C=0.25: 0.7731907485700075
Accuracy for C=0.5: 0.7963193235513554
Accuracy for C=1: 0.8167122606316837


### Still has room to learn

# Word Counts

In [15]:
wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75, 
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.6682417309127083
Accuracy for C=0.05: 0.7194727679681672
Accuracy for C=0.25: 0.7769211638895797
Accuracy for C=0.5: 0.8007958219348421
Accuracy for C=1: 0.8226809251429993


### Still has room to learn

# Term Frequency-Inverse Document Frequency (TF-IDF)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.6058194478985327
Accuracy for C=0.05: 0.646107933349913
Accuracy for C=0.25: 0.7127580203929371
Accuracy for C=0.5: 0.7408604824670479
Accuracy for C=1: 0.7699577219597115




### Still has room to learn

# Naive Bayes

In [17]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("basics").getOrCreate()

In [18]:
from pyspark import SparkFiles

# Load in user_data.csv from S3 into a DataFrame
spark = SparkSession.builder.master("local").appName("CsvReader").getOrCreate()
spark_df = spark.read.format("csv").option("header", "true").load(r"./final_df.csv")

In [19]:
from pyspark.sql.types import IntegerType

spark_df = spark_df.withColumn("rating1", spark_df["rating"].cast(IntegerType()))
spark_df = spark_df.drop(spark_df.rating)
spark_df = spark_df.withColumnRenamed("rating1", "rating")

In [20]:
from pyspark.sql.functions import regexp_extract, length
review_df = spark_df.withColumnRenamed("rating", "label")\
      .withColumnRenamed("content", "review_text")\
      .select(["label", "review_text"])
review_df = review_df.withColumn('review_length', length(review_df['review_text'])).dropna()
review_df.cache()

DataFrame[label: int, review_text: string, review_length: int]

In [21]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Create all the features to the data set
tokenizer = Tokenizer(inputCol="review_text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [22]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'review_length'], outputCol='features')

In [23]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

In [24]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(review_df)
cleaned = cleaner.transform(review_df)

In [25]:
# Break data down into a training set and a testing set
from pyspark.ml.classification import NaiveBayes
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [26]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+-----+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|         review_text|review_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|             *RAWR*?|            9|       [, , *rawr*?]|       [, , *rawr*?]|(262144,[249180,2...|(262144,[249180,2...|(262145,[249180,2...|[-144.56509102314...|[0.41089407023813...|       1.0|
|    0|                   .|            3|             [, , .]|             [, , .]|(262144,[1536,249...|(262144,[1536,249...|(262145,[1536,249...|[-55.987376438846...|[0.99810405076557...|       

In [27]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.806893


# N-grams

In [28]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.7450882864958965
Accuracy for C=0.05: 0.845560805769709
Accuracy for C=0.25: 0.8826162646107933
Accuracy for C=0.5: 0.8846058194478985
Accuracy for C=1: 0.8863466799303655


### Still has room to learn, but it seems to be plateauing

# Support Vector Machines (SVM)

In [29]:
from sklearn.svm import LinearSVC

### n = 2

In [33]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.818950509823427
Accuracy for C=0.05: 0.8659537428500373
Accuracy for C=0.25: 0.8679432976871425
Accuracy for C=0.5: 0.8659537428500373
Accuracy for C=1: 0.8617259388211888


### n = 3

In [34]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.842327779159413
Accuracy for C=0.05: 0.8664511315593136
Accuracy for C=0.25: 0.8619746331758269
Accuracy for C=0.5: 0.8589903009201691
Accuracy for C=1: 0.8562546630191494


### n = 4

In [32]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.8634667993036558
Accuracy for C=0.05: 0.8756528226809251
Accuracy for C=0.25: 0.8691867694603332
Accuracy for C=0.5: 0.8614772444665506
Accuracy for C=1: 0.8560059686645113




#### The accuracy is now showing peaks. Multiple rounds will be conducted and then averaged to see which n value will yield the greatest accuracy.

In [41]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

list2 = []
for i in range(10):
    list1 = []
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.75, random_state = i)
    
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        svm = LinearSVC(C=c)
        svm.fit(X_train, y_train)
        list1.append(accuracy_score(y_val, svm.predict(X_val)))
    
    list2.append(list1)

# a = 0

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
#     print ("Accuracy for C=%s: %s" 
#            % (c, np.sum(list2[:][a]) / len(list2)))
#     a += 1


np.sum(list2[:][0])

4.277791594130814

### n = 3, stemming

In [79]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(stemmed_reviews_train)
X = ngram_vectorizer.transform(stemmed_reviews_train)
X_test = ngram_vectorizer.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.8463068888336235
Accuracy for C=0.05: 0.8639641880129321
Accuracy for C=0.25: 0.8579955235016166
Accuracy for C=0.5: 0.853767719472768
Accuracy for C=1: 0.8458095001243472


### n = 3, lemmatization

In [80]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(lemmatized_reviews_train)
X = ngram_vectorizer.transform(lemmatized_reviews_train)
X_test = ngram_vectorizer.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.8515294702810247
Accuracy for C=0.05: 0.8751554339716489
Accuracy for C=0.25: 0.8699328525242477
Accuracy for C=0.5: 0.8642128823675702
Accuracy for C=1: 0.8592389952748073


In [23]:
new_df.to_csv("./final_df.csv", index=False)

In [61]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(stemmed_reviews_train)
X = ngram_vectorizer.transform(stemmed_reviews_train)
X_test = ngram_vectorizer.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.5
)

ccc = []
c_scores = []

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    ccc.append(c)
    c_scores.append(accuracy_score(y_val, svm.predict(X_val)))   

Accuracy for C=0.04: 0.8205447083696058
Accuracy for C=0.05: 0.8211665215769183
Accuracy for C=0.06: 0.8215396095013058
Accuracy for C=0.07: 0.8207934336525308
Accuracy for C=0.08: 0.8224101479915433
Accuracy for C=0.09: 0.8215396095013058
Accuracy for C=0.1: 0.8206690710110682
Accuracy for C=0.11: 0.8209177962939933
Accuracy for C=0.12: 0.8204203457281433
Accuracy for C=0.13: 0.8200472578037558
Accuracy for C=0.14: 0.8196741698793683
Accuracy for C=0.15: 0.8196741698793683


In [63]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(lemmatized_reviews_train)
X = ngram_vectorizer.transform(lemmatized_reviews_train)
X_test = ngram_vectorizer.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.5
)

ccc = []
c_scores = []

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    ccc.append(c)
    c_scores.append(accuracy_score(y_val, svm.predict(X_val)))   

Accuracy for C=0.04: 0.8225345106330059
Accuracy for C=0.05: 0.8199228951622932
Accuracy for C=0.06: 0.8190523566720557
Accuracy for C=0.07: 0.8170625544086556
Accuracy for C=0.08: 0.8169381917671932
Accuracy for C=0.09: 0.8165651038428057
Accuracy for C=0.1: 0.8165651038428057
Accuracy for C=0.11: 0.8171869170501181
Accuracy for C=0.12: 0.8171869170501181
Accuracy for C=0.13: 0.8164407412013431
Accuracy for C=0.14: 0.8150727521452555
Accuracy for C=0.15: 0.8143265762964805


In [93]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

ccc = []
c_scores = []

for c in [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    ccc.append(c)
    c_scores.append(accuracy_score(y_val, svm.predict(X_val)))   

Accuracy for C=0.04: 0.8674459089778662
Accuracy for C=0.05: 0.8706789355881621
Accuracy for C=0.06: 0.8701815468788858
Accuracy for C=0.07: 0.8714250186520766
Accuracy for C=0.08: 0.8726684904252674
Accuracy for C=0.09: 0.8726684904252674
Accuracy for C=0.1: 0.8731658791345437
Accuracy for C=0.11: 0.87366326784382
Accuracy for C=0.12: 0.8724197960706292
Accuracy for C=0.13: 0.8716737130067147
Accuracy for C=0.14: 0.8709276299428003
Accuracy for C=0.15: 0.8706789355881621


In [None]:
### n = 3, lemmatization

In [94]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

ccc = []
c_scores = []

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    ccc.append(c)
    c_scores.append(accuracy_score(y_val, svm.predict(X_val)))   

Accuracy for C=0.04: 0.855508579955235
Accuracy for C=0.05: 0.8557572743098731
Accuracy for C=0.06: 0.855508579955235
Accuracy for C=0.07: 0.8547624968913206
Accuracy for C=0.08: 0.8545138025366824
Accuracy for C=0.09: 0.853767719472768
Accuracy for C=0.1: 0.8545138025366824
Accuracy for C=0.11: 0.8545138025366824
Accuracy for C=0.12: 0.853767719472768
Accuracy for C=0.13: 0.8535190251181298
Accuracy for C=0.14: 0.8560059686645113
Accuracy for C=0.15: 0.8557572743098731


# Final Model

In [None]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(lemmatized_reviews_train)
X = ngram_vectorizer.transform(lemmatized_reviews_train)
X_test = ngram_vectorizer.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

ccc = []
c_scores = []

for c in [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    ccc.append(c)
    c_scores.append(accuracy_score(y_val, svm.predict(X_val)))   

In [66]:
final_svm_ngram = LinearSVC(C=0.06)
final_svm_ngram.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_svm_ngram.predict(X_test)))

Final Accuracy: 0.8164635662770455


In [39]:
import matplotlib.pyplot as plt    
                    
plt.plot(ccc, c_scores)
plt.show()

<Figure size 640x480 with 1 Axes>

# Let's test this baby out!

In [40]:
final = LinearSVC(tol=.000001,C=0.07)
final.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final.predict(X_test)))

Final Accuracy: 0.8995274807261875


In [41]:
tweets_list = [i[1]["content"] for i in tweets.iterrows()]
rating_list = [i[1]["rating"] for i in tweets.iterrows()]

In [42]:
twitter_cleaned = preprocess_reviews(tweets_list)
len(twitter_cleaned)

101

In [43]:
tws = ngram_vectorizer.transform(twitter_cleaned)
tws.shape

(101, 137709)

In [44]:
predictions = final.predict(tws[:100])

In [45]:
pd.DataFrame({"Prediction": predictions, "Actual": rating_list[:100]}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,1,1
4,1,1


In [46]:
total = 0
score = 0
for i, j in zip(predictions, rating_list):
    total += 1
    if i == j:
        score += 1
        
print(f"Accuracy: {score/total}")

Accuracy: 0.88


# Save & Load Model

In [47]:
from sklearn.externals import joblib



In [48]:
joblib.dump(final, "final_model_svc.pkl")

['final_model_svc.pkl']

In [49]:
try:
    retrieve_model = joblib.load("final_model_svc.pkl")
    print("using trained model")
except:
    print("model not found")
    joblib.dump(final, "final_model_svc.pkl")

using trained model
