# I. Training sentiment analysis model on Imdb reviews

## 1, Packages

In [1]:
import urllib.request as req
import tarfile
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bghorvath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2, Getting the data

In [None]:
imdb_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

save_filename = "aclImdb_v1.tar.gz"
if not os.path.exists(save_filename):
    req.urlretrieve(imdb_url, save_filename)

imdb_folder = "aclImdb"
if not os.path.exists(imdb_folder):
    with tarfile.open(save_filename) as tar:
        tar.extractall()

In [4]:
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)

df.columns = ['review', 'sentiment']

np.random.seed(0)

df = df.reindex(np.random.permutation(df.index))
df.to_csv('IMDb_Reviews.csv', index=False, encoding='utf-8')

## 3, Read data

In [14]:
df = pd.read_csv('IMDb_Reviews.csv')

y = df.sentiment.values
X = df.review.values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, 
                                                    shuffle=False)

## 4, Model

In [3]:
ps = PorterStemmer()

sw = stopwords.words('english')

def tokenizer_porter(text):
    tokens = [ps.stem(word) for word in text.split()]
    return [w for w in tokens if not w in sw]

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [4]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False, 
                        preprocessor=preprocessor, 
                        tokenizer=tokenizer_porter, 
                        use_idf=True, norm='l2',
                        smooth_idf=True)

In [5]:
logreg = LogisticRegressionCV(cv=9,
                          scoring='accuracy',
                          random_state=0,
                           n_jobs=-1,
                           verbose=3,
                          max_iter=300)

clf = make_pipeline(tfidf, logreg)
clf.fit(X_train, y_train)

saved_model = open('clf_model1.sav','wb')
pickle.dump(clf, saved_model)
saved_model.close()

## 5, Evaluation

In [6]:
filename = 'clf_model1.sav'
clf = pickle.load(open(filename,'rb'))

In [19]:
clf.score(X_test, y_test)

0.9002

# II. Inference on tweets

## 1, Packages (again)

In [10]:
import urllib.request as req
import tarfile
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
import pickle
import nest_asyncio
import twint
import csv
from datetime import date
from datetime import timedelta

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bghorvath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1, Imdb dataset

In [11]:
nest_asyncio.apply()
pd.set_option('display.max_columns', None)

movie_data = pd.read_csv('processed_IMDb_movies.csv')

movie_data['usa_gross_income'] = movie_data['usa_gross_income'].str.replace('$','').replace('NaN',np.nan).astype(float)

filtered_movies = movie_data[(movie_data['year']==2018) & (movie_data['country']=='USA') & (movie_data['usa_gross_income'] > 1000000)].loc[:][['imdb_title_id','original_title','year','date_published','usa_gross_income']].reset_index()

del filtered_movies['index']

  movie_data['usa_gross_income'] = movie_data['usa_gross_income'].str.replace('$','').replace('NaN',np.nan).astype(float)


## 2, Fetch tweets about selected movies

In [24]:
def add_days(iso_date,days):

    d = date.fromisoformat(iso_date)
    d_plus_days = d + timedelta(days=days)
    
    return d_plus_days.isoformat()

def run_twint(dataframe):
    #for i in range(len(dataframe)):
    for i in range(3):
        c = twint.Config()
        c.Search = dataframe['original_title'][i]
        c.Lang = 'en'
        c.Limit = 100
        c.Output = 'tweets/{}.csv'.format(i)
        c.Email = False
        c.Custom['tweet'] = ['tweet']
        c.Phone = False
        c.Links = 'exclude'
        c.Pandas_clean = True
        c.Filter_retweets = True
        c.Hide_output = True
        c.Store_csv = True
        c.Since = dataframe['date_published'][i]
        c.Until = add_days(dataframe['date_published'][i],3)
        twint.run.Search(c)
    return

In [25]:
run_twint(filtered_movies)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


## 3, Sentiment analysis on tweets

### Loading model

In [12]:
ps = PorterStemmer()

sw = stopwords.words('english')

def tokenizer_porter(text):
    tokens = [ps.stem(word) for word in text.split()]
    return [w for w in tokens if not w in sw]

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False, 
                        preprocessor=preprocessor, 
                        tokenizer=tokenizer_porter, 
                        use_idf=True, norm='l2',
                        smooth_idf=True)

logreg = LogisticRegressionCV(cv=9,
                          scoring='accuracy',
                          random_state=0,
                           n_jobs=-1,
                           verbose=3,
                          max_iter=300)

filename = 'clf_model1.sav'
clf = pickle.load(open(filename,'rb'))

In [13]:
import statistics
from pathlib import Path

sentiment_dataset = pd.DataFrame({'sentiment':[]}, columns = ['sentiment'])

In [14]:
directory = 'tweets/'
for filename in os.listdir(directory):
    fn = os.path.join(directory, filename)
    print(fn)
    p = Path(fn)
    with open(fn, 'r') as csvfile:
        reader = csv.reader(csvfile)
        a = []
        for row in reader:
            a.append(int(clf.predict(row)))
        sentiment_dataset.loc[p.stem] = [statistics.mode(a)]

tweets/0.csv
tweets/5.csv
tweets/1.csv
tweets/2.csv


In [15]:
sentiment_dataset

Unnamed: 0,sentiment
0,0.0
5,0.0
1,1.0
2,0.0
