In [1]:
import re
import pickle
import numpy as np
import pandas as pd

In [2]:
# Column labels handed to `read_csv` through `names=`
# (presumably the CSV ships without a header row — TODO confirm).
cols = ["target", "ids", "date", "flag", "user", "text"]
tweets_df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=cols,
    encoding="ISO-8859-1",
)

In [3]:
# Compiled once and reused for every tweet. Written as a raw string so that
# `\S` and `\d` reach the regex engine verbatim instead of being parsed as
# (invalid) string escapes, which newer Python versions warn about.
# The last alternative (`http?:`) also matches `htt:`; it is kept so the
# cleaned output stays identical to the previous pattern.
_NOISE_RE = re.compile(r"@\S+|#\S+|\d+|https?:\S+|http?:\S+")


def clean_text(text):
    """Remove twitter usernames, hashtags, numbers, and links from `text`.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with `str()` first, so non-string
        inputs are accepted.

    Returns
    -------
    str
        The text with every `@...`, `#...`, digit run and `http(s):...`
        token deleted. Surrounding whitespace is left untouched.
    """
    return _NOISE_RE.sub('', str(text))

In [4]:
# Clean every tweet in place, then preview the first ten cleaned rows.
tweets_df["text"] = tweets_df["text"].apply(clean_text)
tweets_df["text"].head(10)

0      - Awww, that's a bummer.  You shoulda got Da...
1    is upset that he can't update his Facebook by ...
2     I dived many times for the ball. Managed to s...
3      my whole body feels itchy and like its on fire 
4     no, it's not behaving at all. i'm mad. why am...
5                                  not the whole crew 
6                                          Need a hug 
7     hey  long time no see! Yes.. Rains a bit ,onl...
8                            nope they didn't have it 
9                                      que me muera ? 
Name: text, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_df["text"],
    tweets_df["target"],
    test_size=0.2,
    random_state=1,
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features. The vectorizer is fitted on the training split only
# and then applied unchanged to the test split.
cv = CountVectorizer(stop_words="english")
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# Report the (rows, features) shape of each matrix.
for caption, matrix in (('X_train_cv: ', X_train_cv), ('X_test_cv: ', X_test_cv)):
    print(caption, matrix.shape)

X_train_cv:  (1280000, 233195)
X_test_cv:  (320000, 233195)


In [7]:
from sklearn.feature_selection import SelectPercentile, chi2

# Score features with chi-squared against the training labels and keep the
# top 50 percent; the same fitted selector is then applied to both splits.
sp = SelectPercentile(score_func=chi2, percentile=50).fit(X_train_cv, y_train)
X_train_cv_small = sp.transform(X_train_cv)
X_test_cv_small = sp.transform(X_test_cv)

In [8]:
# Show how many feature columns survived the selection step.
for caption, matrix in (
    ('X_train_cv_small: ', X_train_cv_small),
    ('X_test_cv_small: ', X_test_cv_small),
):
    print(caption, matrix.shape)

X_train_cv_small:  (1280000, 116597)
X_test_cv_small:  (320000, 116597)


In [9]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


def fit_and_report(name, model, encoder=None, suffix="\n"):
    """Fit `model` on the training split, print its test accuracy, return predictions.

    Parameters
    ----------
    name : str
        Label used in the printed "<name> Test Accuracy: ..." line.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`.
    encoder : fitted LabelEncoder, optional
        When given, the model is trained on `encoder.transform(y_train)` and
        its predictions are mapped back with `encoder.inverse_transform`
        before scoring, so the score is always computed in the original
        label space.
    suffix : str
        Text appended to the printed line (a newline by default, which
        leaves a blank line between reports).

    Returns
    -------
    array-like
        Test-set predictions expressed in the original labels.

    Reads `X_train_cv_small`, `X_test_cv_small`, `y_train` and `y_test` from
    the notebook's global scope.
    """
    train_labels = y_train if encoder is None else encoder.transform(y_train)
    model.fit(X_train_cv_small, train_labels)
    predictions = model.predict(X_test_cv_small)
    if encoder is not None:
        predictions = encoder.inverse_transform(predictions)
    print("{} Test Accuracy: {}{}".format(name, accuracy_score(y_test, predictions), suffix))
    return predictions


linear_svc = LinearSVC(random_state=1, C=0.1, max_iter=1000)
y_pred = fit_and_report("Linear SVC", linear_svc)

log_reg = LogisticRegression(random_state=1, C=1.0, max_iter=1000, n_jobs=-1)
y_pred = fit_and_report("Logistic Regression", log_reg)

sgd = SGDClassifier(random_state=1, max_iter=1000, n_jobs=-1)
y_pred = fit_and_report("SGD Classifier", sgd)

rf = RandomForestClassifier(random_state=1, max_depth=30, n_estimators=250, n_jobs=-1)
y_pred = fit_and_report("Random Forest", rf)

# NOTE(review): recent XGBoost releases raise a ValueError when the class
# labels are not exactly 0..n_classes-1. The label values in `y_train` are
# not visible here (TODO confirm), so they are encoded before fitting and the
# predictions are decoded again; this is a no-op if the labels already comply.
# `xgb.predict` itself therefore returns encoded labels (0..n_classes-1).
xgb = XGBClassifier(random_state=1, learning_rate=0.1, n_estimators=250, n_jobs=-1)
y_pred = fit_and_report("XGBoost", xgb, encoder=LabelEncoder().fit(y_train), suffix="")

Linear SVC Test Accuracy: 0.777215625





Logistic Regression Test Accuracy: 0.77799375

SGD Classifier Test Accuracy: 0.76473125

Random Forest Test Accuracy: 0.74325625

XGBoost Test Accuracy: 0.726815625


In [10]:
def predict(text, model):
    """Classify one raw tweet as "Negative" or "Positive" using `model`.

    The text is run through `clean_text`, then the fitted `cv` vectorizer,
    then the fitted `sp` feature selector (both read from the notebook's
    global scope) before being handed to `model.predict`.

    Returns "Negative" when the predicted label equals 0, otherwise "Positive".
    """
    features = sp.transform(cv.transform([clean_text(text)]))
    label = model.predict(features)[0]
    if label == 0:
        return "Negative"
    return "Positive"

In [11]:
# Smoke-test the fitted logistic-regression model on a single hand-written tweet.
random_tweet = (
    "I suggest you follow @loganbonner if you’re into #ttrpg. He retweets a lot of interesting stuff, and is a good dude. #ff"
)
predict(random_tweet, log_reg)

'Positive'

In [12]:
# Persist the logistic-regression model together with the fitted vectorizer
# and feature selector that `predict` needs. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes.
for artifact, filename in (
    (log_reg, "log_regr_model.pkl"),
    (cv, "count_vectorizer.pkl"),
    (sp, "select_percentile.pkl"),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)