In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import regex as re
import string
from collections import defaultdict

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the raw CSV. Later cells read the 'Sentence' (text) and 'Sentiment'
# (label) columns from this frame and leave it unmodified.
dirty_df = pd.read_csv('data/advanced_trainset.csv')

In [None]:
# Bar chart of how many rows carry each Sentiment label (class balance).
dirty_df.Sentiment.value_counts().plot(kind='bar')

In [None]:
def clean_sentence(s):
    """Normalise one sentence before n-gram extraction.

    Steps, in order: lowercase; drop detached possessive `` 's`` tokens;
    re-attach ``%`` and ``$`` to the adjacent number; replace links with a
    space; replace sentence punctuation with a space while keeping the
    separators inside numbers; strip remaining punctuation; collapse runs
    of spaces.

    Parameters
    ----------
    s : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned sentence. Leading/trailing spaces are NOT stripped; a
        trailing space can remain where final punctuation was removed.
    """
    # All patterns are raw strings: escapes such as \d or \$ inside a normal
    # string literal are invalid escape sequences and trigger a
    # SyntaxWarning on Python 3.12+.

    # To lowercase
    s = s.lower()

    # Drop possessive " 's" tokens (tokenised form, e.g. "nokia 's")
    s = re.sub(r" 's", '', s)

    # Fix % and $ whitespace: "5 %" -> "5%", "$ 10" -> "$10"
    s = re.sub(r'(?<=\d) %', '%', s)
    s = re.sub(r'\$ (?=\d)', '$', s)

    # Remove links
    s = re.sub(r'http\S+', ' ', s)

    # Replace periods not preceded by a digit and commas not followed by a
    # digit, so decimal points and thousands separators survive this step
    s = re.sub(r'(?<!\d)\.|\,(?!\d)', ' ', s)

    # Remove punctuation: hyphens/parentheses become spaces; apostrophes,
    # backticks and the remaining (in-number) commas are deleted outright
    s = re.sub(r'-|\(|\)', ' ', s)
    s = re.sub(r"'|\,|\`", '', s)

    # Remove extra whitespace (runs of spaces only)
    s = re.sub(r' +', ' ', s)

    return s

In [None]:
# Numeric encoding of the three sentiment labels.
label_to_score = {'negative': -1, 'neutral': 0, 'positive': 1}

# Work on a deep copy so the raw frame is left untouched, then overwrite
# the text column with its cleaned form and the label column with its score.
df = dirty_df.copy(deep=True)
df['Sentence'] = df.Sentence.apply(clean_sentence)
df['Sentiment'] = df.Sentiment.map(label_to_score)
df

In [None]:
def create_ngrams(s, n):
    """Return all word-level 1-grams up to n-grams of ``s`` as one flat list.

    The sentence is split on whitespace. The result lists every unigram
    first, then every bigram, and so on up to size ``n``; within one size
    the n-grams appear in sentence order, each joined by a single space.
    Sizes longer than the sentence contribute nothing, and ``n <= 0``
    yields an empty list.
    """
    tokens = s.split()
    grams = []
    for size in range(1, n + 1):
        # Slide a window of `size` tokens across the sentence.
        window_count = len(tokens) - size + 1
        for begin in range(window_count):
            grams.append(' '.join(tokens[begin:begin + size]))
    return grams

# Quick check: for 'a b c d' with n=3 this lists the 4 unigrams, then the
# 3 bigrams, then the 2 trigrams.
create_ngrams('a b c d', 3)

In [None]:
# Attach each sentence's unigrams, bigrams and trigrams as a list-valued column.
df['ngrams'] = df.Sentence.apply(lambda sentence: create_ngrams(sentence, n=3))
df.head()

In [None]:
# Hold out a test split using scikit-learn's default proportions. The fixed
# random_state makes the split, and every metric derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.ngrams, df.Sentiment, random_state=42
)

In [None]:
# Count n-gram frequencies on the training rows only, so the vocabulary is
# not shaped by held-out sentences (avoids test-set leakage). Rows are picked
# through y_train.index because X_train is rebound to feature vectors in a
# later cell, whereas the label index keeps identifying the training rows.
# The n-grams already stored in df['ngrams'] are reused instead of being
# recomputed from the sentences.
train_ngrams = df.loc[y_train.index, 'ngrams']

wordCount = defaultdict(int)
for sentence_ngrams in train_ngrams:
    for w in sentence_ngrams:
        wordCount[w] += 1

# (count, n-gram) pairs, most frequent first; equal counts are ordered by
# descending n-gram text, as tuple comparison dictates.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)

In [None]:
# Vocabulary: the n-grams of the first 1000 entries of `counts`.
words = [ngram for _count, ngram in counts[:1000]]
# Position of each vocabulary n-gram inside a feature vector.
wordId = {ngram: position for position, ngram in enumerate(words)}

In [None]:
def feature(ng):
    """Build the count vector of vocabulary n-grams for one sentence.

    Parameters
    ----------
    ng : iterable of str
        The n-grams extracted from a single sentence.

    Returns
    -------
    list of int
        ``len(words) + 1`` entries: the number of occurrences of each
        vocabulary n-gram at the position given by ``wordId``, followed by
        a constant 1 acting as the offset term. N-grams outside the
        vocabulary are ignored.
    """
    feat = [0] * len(words)
    for w in ng:
        # Dict lookup is O(1); testing membership against the `words` list
        # would scan the whole vocabulary for every n-gram.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1)  # offset
    return feat

In [None]:
# Build the design matrices from the n-gram column of `df`, selecting rows by
# the label indices of each split. Reading from `df` rather than from
# X_train/X_test keeps this cell idempotent: if it consumed its own output,
# re-running it would pass count vectors to feature() and silently produce
# all-zero rows.
X_train = [feature(ng) for ng in df.loc[y_train.index, 'ngrams']]
X_test = [feature(ng) for ng in df.loc[y_test.index, 'ngrams']]

In [None]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No intercept is fitted by the model itself.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)

# Learned weights and the predictions for the held-out rows.
theta = clf.coef_
preds = clf.predict(X_test)

In [None]:
# Pair every weight except the last one with its vocabulary n-gram and
# order the pairs from the lowest weight to the highest.
wordSort = sorted(zip(theta[:-1], words))

In [None]:
# Text report: mean squared error on the held-out rows, followed by the
# n-grams at both ends of the weight-sorted list.
print("1,000 most common n-grams")
print("--------------------------")
squared_errors = (y_test - preds) ** 2
mse = sum(squared_errors) / len(y_test)
print("MSE: " + str(mse))
print()

# wordSort is in ascending weight order, so its head holds the lowest weights.
print("most negative n-grams")
for entry in wordSort[:5]:
    print(entry)
print()

# Walk backwards from the end to list the five highest weights, largest first.
print("most positive n-grams")
for entry in wordSort[:-6:-1]:
    print(entry)
print()

In [None]:
# Logistic-regression classifier with scikit-learn's default settings,
# trained on the same count features. Note that this rebinds `clf` and
# `preds`, replacing the values assigned by the earlier Ridge fit.
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# Score the classifier on the held-out rows; as the last expression of the
# cell, the value is displayed as the cell output.
clf.score(X_test, y_test)