# CS506 Midterm
### Name: Andrew Tuckman
### BUID: U40643751

## Imports:

In [7]:
import re                   # String processing
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix

## Load files into DataFrames:

In [8]:
train = pd.read_csv("./data/train.csv")

## Feature extraction:

In [9]:
train = train[['Score','Text']]
print(train)

         Score                                               Text
0          4.0  This is a charming version of the classic Dick...
1          3.0  It was good but not as emotionally moving as t...
2          3.0  Don't get me wrong, Winkler is a wonderful cha...
3          5.0  Henry Winkler is very good in this twist on th...
4          4.0  This is one of the best Scrooge movies out.  H...
...        ...                                                ...
1697528    NaN  wow $269.99 for the entire series on Blu Ray??...
1697529    5.0  Finally, the holy grail of tv-on-dvd boxsets i...
1697530    5.0  Could this be a true or I'm i dreaming batman ...
1697531    5.0  I've been a fan of the series since I was a yo...
1697532    5.0  People seriously need to wake up and realize t...

[1697533 rows x 2 columns]


## Drop null values:

In [10]:
train = train.dropna()

In [11]:
print(train)

         Score                                               Text
0          4.0  This is a charming version of the classic Dick...
1          3.0  It was good but not as emotionally moving as t...
2          3.0  Don't get me wrong, Winkler is a wonderful cha...
3          5.0  Henry Winkler is very good in this twist on th...
4          4.0  This is one of the best Scrooge movies out.  H...
...        ...                                                ...
1697526    4.0  Looking very much forward to this release, but...
1697529    5.0  Finally, the holy grail of tv-on-dvd boxsets i...
1697530    5.0  Could this be a true or I'm i dreaming batman ...
1697531    5.0  I've been a fan of the series since I was a yo...
1697532    5.0  People seriously need to wake up and realize t...

[1397480 rows x 2 columns]


## - Text Pre-processing -
### Convert string sentences to list of string words, and remove characters:

In [12]:
expr_pattern = re.compile(pattern = r'[^\w\s]+', flags = 0)
train['Text'] = [expr_pattern.sub('', word) for word in train['Text'].tolist()]

### Convert string words to lowercase:

In [13]:
train['Text'] = train['Text'].str.lower().str.split()

In [14]:
print(train)

         Score                                               Text
0          4.0  [this, is, a, charming, version, of, the, clas...
1          3.0  [it, was, good, but, not, as, emotionally, mov...
2          3.0  [dont, get, me, wrong, winkler, is, a, wonderf...
3          5.0  [henry, winkler, is, very, good, in, this, twi...
4          4.0  [this, is, one, of, the, best, scrooge, movies...
...        ...                                                ...
1697526    4.0  [looking, very, much, forward, to, this, relea...
1697529    5.0  [finally, the, holy, grail, of, tvondvd, boxse...
1697530    5.0  [could, this, be, a, true, or, im, i, dreaming...
1697531    5.0  [ive, been, a, fan, of, the, series, since, i,...
1697532    5.0  [people, seriously, need, to, wake, up, and, r...

[1397480 rows x 2 columns]


### Remove stop words:

In [15]:
stopwords_list = ['i', "i've", "ive" 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "youre", "you've", "youve", "you'll", "you'd", "youll", "youd", 'your', 'yours', 'yourself', 
'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", "shes", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 
'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", "thatll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 
'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 
'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 
'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", "dont", 'should', "should've", "shouldve",
'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", "arent", 'couldn', "couldn't", "couldnt", 'didn', "didn't", "didnt", 'doesn', "doesn't", "doesnt", 'hadn', "hadn't", "hadnt", 'hasn', 
"hasn't", "hasnt", 'haven', "haven't", "havent", 'isn', "isn't", "isnt", 'ma', 'mightn', "mightn't", "mightnt", 'mustn', "mustn't", "mustnt", 'needn', "needn't", "neednt", 'shan', "shan't", "shant", 'shouldn', "shouldn't", "shouldnt", 
'wasn', "wasn't", "wasnt", 'weren', "weren't", "werent", 'won', "won't", "wont", 'wouldn', "wouldn't", "wouldnt"]

In [16]:
train['Text'] = train['Text'].apply(lambda review: [word for word in review if word not in stopwords_list])

In [17]:
print(train)

         Score                                               Text
0          4.0  [charming, version, classic, dickens, tale, he...
1          3.0  [good, emotionally, moving, christmas, carol, ...
2          3.0  [get, wrong, winkler, wonderful, character, ac...
3          5.0  [henry, winkler, good, twist, classic, story, ...
4          4.0  [one, best, scrooge, movies, henry, winkler, o...
...        ...                                                ...
1697526    4.0  [looking, much, forward, release, price, bit, ...
1697529    5.0  [finally, holy, grail, tvondvd, boxsets, comin...
1697530    5.0  [could, true, im, dreaming, batman, favorite, ...
1697531    5.0  [ive, fan, series, since, young, boy, personal...
1697532    5.0  [people, seriously, need, wake, realize, cant,...

[1397480 rows x 2 columns]


### Convert list back into string:

In [20]:
train['Text'] = train['Text'].apply(', '.join)
p1 = re.compile(r'[^\w\s]+')
train['Text'] = [p1.sub('', x) for x in train['Text'].tolist()]
train['Text'] = train['Text'].str.lower()

In [21]:
print(train)

         Score                                               Text
0          4.0  charming version classic dickens tale henry wi...
1          3.0  good emotionally moving christmas carol dicken...
2          3.0  get wrong winkler wonderful character actor wo...
3          5.0  henry winkler good twist classic story convent...
4          4.0  one best scrooge movies henry winkler outdoes ...
...        ...                                                ...
1697526    4.0  looking much forward release price bit outrage...
1697529    5.0  finally holy grail tvondvd boxsets coming blur...
1697530    5.0  could true im dreaming batman favorite comic b...
1697531    5.0  ive fan series since young boy personaly consi...
1697532    5.0  people seriously need wake realize cant get bl...

[1397480 rows x 2 columns]


In [22]:
X_train = train['Text']
y_train = train['Score']

0    4.0
1    3.0
2    3.0
3    5.0
4    4.0
Name: Score, dtype: float64

In [None]:
# test = pd.read_csv("./data/test.csv")
# sample = pd.read_csv("./data/sample.csv")