# Importing

In [16]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
from imblearn.over_sampling import ADASYN

import re

[nltk_data] Downloading package punkt to /Users/moseslin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Never truncate cell contents when rendering frames, so long tweet text is
# shown in full. The fully qualified option name is used because pandas
# resolves abbreviated keys by pattern matching, which can stop working if
# another option with a matching name is added in a later release.
pd.set_option('display.max_colwidth', None)

In [4]:
# Read the tweet spreadsheet into a DataFrame and preview its first five rows
df = pd.read_excel('twitterdata.xlsx')
df.head(5)

Unnamed: 0,TweetID,Weekday,Hour,Day,Lang,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,text,LocationID,UserID
0,tw-682712873332805633,Thursday,17.0,31.0,en,0.0,44.0,0.0,0.0,35.0,0.0,We are hiring: Senior Software Engineer - Proto http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost #job @awscloud #job #protocol #networking #aws #mediastreaming,3751.0,tw-40932430
1,tw-682713045357998080,Thursday,17.0,31.0,en,1.0,1810.0,5.0,0.0,53.0,2.0,RT @CodeMineStatus: This is true Amazon Web Services https://aws.amazon.com/ #php #html #html5 #css #webdesign #seo #java #javascript htt,3989.0,tw-3179389829
2,tw-682713219375476736,Thursday,17.0,31.0,en,0.0,282.0,0.0,0.0,47.0,0.0,Devops Engineer Aws Ansible Cassandra Mysql Ubuntu Ruby On Rails Jobs in Austin TX #Austin #TX #jobs #jobsearch https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html,3741.0,tw-4624808414
3,tw-682713436967579648,Thursday,17.0,31.0,en,0.0,2087.0,4.0,0.0,53.0,0.0,Happy New Year to all those AWS instances of ours!,3753.0,tw-356447127
4,tw-682714048199311366,Thursday,17.0,31.0,en,0.0,953.0,0.0,0.0,47.0,0.0,Amazon is hiring! #Sr. #International Tax Manager - AWS in #Seattle apply now! #jobs http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle%20Washington%20US&k=Sr.%20International%20Tax%20Manager%20-%20AWS http://twitter.com/NeuvooAccSea/status/682714048199311366/photo/1,3751.0,tw-3172686669


# Cleaning

In [5]:
# Swap the original headers for all-lowercase names that are easier to type.
# The names are assigned positionally, so their order must match the frame.

df.columns = [
    'tweetid',
    'weekday',
    'hour',
    'day',
    'lang',
    'isreshare',
    'reach',
    'retweetcount',
    'likes',
    'klout',
    'sentiment',
    'text',
    'locationid',
    'userid',
]
df.columns

Index(['tweetid', 'weekday', 'hour', 'day', 'lang', 'isreshare', 'reach',
       'retweetcount', 'likes', 'klout', 'sentiment', 'text', 'locationid',
       'userid'],
      dtype='object')

In [6]:
# Most tweets are in English, so keep only the rows whose language is 'en'
# and confirm the remaining language counts.

df = df.loc[df['lang'].eq('en')]
df['lang'].value_counts()

en    91886
Name: lang, dtype: int64

In [7]:
# Remove rows whose tweet text is missing, since null values break the
# string processing below. dropna(inplace=True) on df['text'] only touches
# the extracted Series and leaves df's rows in place, so the DataFrame itself
# is filtered on the 'text' column instead.

df = df.dropna(subset=['text'])

In [8]:
# Look at a handful of raw tweets to see what we are working with. Only the
# first ten are shown; listing the whole column would dump tens of thousands
# of strings into the cell output.

df['text'].head(10).tolist()

['We are hiring: Senior Software Engineer - Proto http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost #job @awscloud #job #protocol #networking #aws #mediastreaming',
 'RT @CodeMineStatus: This is true Amazon Web Services https://aws.amazon.com/ #php #html #html5 #css #webdesign #seo #java #javascript htt',
 'Devops Engineer Aws Ansible Cassandra Mysql Ubuntu Ruby On Rails Jobs in Austin TX #Austin #TX #jobs #jobsearch https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html',
 'Happy New Year to all those AWS instances of ours!',
 'Amazon is hiring! #Sr. #International Tax Manager - AWS in #Seattle apply now! #jobs http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle%20Washington%20US&k=Sr.%20International%20Tax%20Manager%20-%20AWS http://twitter.com/NeuvooAccSea/status/682714048199311366/photo/1',
 '#AWS bc of per-regi

# Getting Links

In [11]:
# Pull every link out of each tweet; each row becomes a list of the matches
url = df['text'].str.findall(pat=r'(http\S+)')
url.iloc[:6]

0                                                                                            [http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost]
1                                                                                                                                                                                                        [https://aws.amazon.com/]
2                                                                                                                 [https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html]
3                                                                                                                                                                                                                               []
4    [http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle

In [12]:
# Keep the extracted link lists next to the tweets they came from
# (the assignment aligns on the shared row index).
df['links'] = url
df

Unnamed: 0,tweetid,weekday,hour,day,lang,isreshare,reach,retweetcount,likes,klout,sentiment,text,locationid,userid,links
0,tw-682712873332805633,Thursday,17.0,31.0,en,0.0,44.0,0.0,0.0,35.0,0.0,We are hiring: Senior Software Engineer - Proto http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost #job @awscloud #job #protocol #networking #aws #mediastreaming,3751.0,tw-40932430,[http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost]
1,tw-682713045357998080,Thursday,17.0,31.0,en,1.0,1810.0,5.0,0.0,53.0,2.0,RT @CodeMineStatus: This is true Amazon Web Services https://aws.amazon.com/ #php #html #html5 #css #webdesign #seo #java #javascript htt,3989.0,tw-3179389829,[https://aws.amazon.com/]
2,tw-682713219375476736,Thursday,17.0,31.0,en,0.0,282.0,0.0,0.0,47.0,0.0,Devops Engineer Aws Ansible Cassandra Mysql Ubuntu Ruby On Rails Jobs in Austin TX #Austin #TX #jobs #jobsearch https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html,3741.0,tw-4624808414,[https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html]
3,tw-682713436967579648,Thursday,17.0,31.0,en,0.0,2087.0,4.0,0.0,53.0,0.0,Happy New Year to all those AWS instances of ours!,3753.0,tw-356447127,[]
4,tw-682714048199311366,Thursday,17.0,31.0,en,0.0,953.0,0.0,0.0,47.0,0.0,Amazon is hiring! #Sr. #International Tax Manager - AWS in #Seattle apply now! #jobs http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle%20Washington%20US&k=Sr.%20International%20Tax%20Manager%20-%20AWS http://twitter.com/NeuvooAccSea/status/682714048199311366/photo/1,3751.0,tw-3172686669,"[http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle%20Washington%20US&k=Sr.%20International%20Tax%20Manager%20-%20AWS, http://twitter.com/NeuvooAccSea/status/682714048199311366/photo/1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,tw-716349383688720385,Saturday,13.0,2.0,en,1.0,415.0,15.0,0.0,44.0,0.0,RT @awscloud: .@Jaspersoft built a #DataWarehouse that provides full cloud analytics using Redshift &amp; EMR! https://aws.amazon.com/solutions/case-studies/jaspersoft/?sc_channel=sm&sc_campaign=gc_dw&sc_publisher=tw_go&sc_content=jaspersoft&sc_country=global&sc_geo=global&sc_category=data_warehouse&trk=global_2016_data_warehouse&adbsc=gc_20160402_59959296&adbid=716285623842123776&adbpl=tw&adbpr=66780587 https://,611.0,tw-3316009302,"[https://aws.amazon.com/solutions/case-studies/jaspersoft/?sc_channel=sm&sc_campaign=gc_dw&sc_publisher=tw_go&sc_content=jaspersoft&sc_country=global&sc_geo=global&sc_category=data_warehouse&trk=global_2016_data_warehouse&adbsc=gc_20160402_59959296&adbid=716285623842123776&adbpl=tw&adbpr=66780587, https://]"
99996,tw-716349615340199937,Saturday,13.0,2.0,en,0.0,763.0,0.0,0.0,29.0,0.0,I added a video to a @YouTube playlist https://www.youtube.com/watch?v=YPf7wSEq9d0&feature=youtu.be&a Module 3 : AWS EC2 Instance Status Checks (System and Instance level),1332.0,tw-1143330170,[https://www.youtube.com/watch?v=YPf7wSEq9d0&feature=youtu.be&a]
99997,tw-716351557516722176,Saturday,13.0,2.0,en,1.0,122.0,435.0,0.0,17.0,1.0,RT @awscloud: Test your iOS Android &amp; web apps against real phones and tablets on AWS. Start today for free! https://cards.twitter.com/cards/13rc8b/1ihbc,1336.0,tw-139346924,[https://cards.twitter.com/cards/13rc8b/1ihbc]
99998,tw-716352501860196352,Saturday,13.0,2.0,en,1.0,287.0,30.0,0.0,42.0,0.0,RT @goserverless: Serverless Offline: Emulate AWS Lambda &amp; API Gateway locally to speed up your development cycles - https://t.co/CcXASlhQq,3795.0,tw-21826302,[https://t.co/CcXASlhQq]


# Removing links from tweets

In [13]:
# Make sure no row has missing text before the regex-based link removal:
# re.sub raises TypeError on non-string values such as NaN. The rows are
# dropped from the DataFrame itself, because dropna(inplace=True) on
# df['text'] would not remove anything from df.
df = df.dropna(subset=['text'])

In [14]:
def removelink(tweet):
    """Return `tweet` with every link removed.

    A link is any occurrence of 'http' followed by one or more
    non-whitespace characters. Only the matched text is deleted, so the
    whitespace that surrounded a link is left in place.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", tweet)

In [17]:
# Strip the links out of every tweet's text, then display the updated frame
df['text'] = df['text'].apply(removelink)
df

Unnamed: 0,tweetid,weekday,hour,day,lang,isreshare,reach,retweetcount,likes,klout,sentiment,text,locationid,userid,links
0,tw-682712873332805633,Thursday,17.0,31.0,en,0.0,44.0,0.0,0.0,35.0,0.0,We are hiring: Senior Software Engineer - Proto #job @awscloud #job #protocol #networking #aws #mediastreaming,3751.0,tw-40932430,[http://www.reqcloud.com/jobs/719865/?k=0LaPxXuFwczs1e32ZURJKrgCIDMQtRO7BquFSQthUKY&utm_source=twitter&utm_campaign=reqCloud_JobPost]
1,tw-682713045357998080,Thursday,17.0,31.0,en,1.0,1810.0,5.0,0.0,53.0,2.0,RT @CodeMineStatus: This is true Amazon Web Services #php #html #html5 #css #webdesign #seo #java #javascript htt,3989.0,tw-3179389829,[https://aws.amazon.com/]
2,tw-682713219375476736,Thursday,17.0,31.0,en,0.0,282.0,0.0,0.0,47.0,0.0,Devops Engineer Aws Ansible Cassandra Mysql Ubuntu Ruby On Rails Jobs in Austin TX #Austin #TX #jobs #jobsearch,3741.0,tw-4624808414,[https://www.jobfindly.com/devops-engineer-aws-ansible-cassandra-mysql-ubuntu-ruby-on-rails-jobs-austin-tx.html]
3,tw-682713436967579648,Thursday,17.0,31.0,en,0.0,2087.0,4.0,0.0,53.0,0.0,Happy New Year to all those AWS instances of ours!,3753.0,tw-356447127,[]
4,tw-682714048199311366,Thursday,17.0,31.0,en,0.0,953.0,0.0,0.0,47.0,0.0,Amazon is hiring! #Sr. #International Tax Manager - AWS in #Seattle apply now! #jobs,3751.0,tw-3172686669,"[http://neuvoo.com/job.php?id=dsvkrujig3&source=twitter&lang=en&client_id=658&l=Seattle%20Washington%20US&k=Sr.%20International%20Tax%20Manager%20-%20AWS, http://twitter.com/NeuvooAccSea/status/682714048199311366/photo/1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,tw-716349383688720385,Saturday,13.0,2.0,en,1.0,415.0,15.0,0.0,44.0,0.0,RT @awscloud: .@Jaspersoft built a #DataWarehouse that provides full cloud analytics using Redshift &amp; EMR!,611.0,tw-3316009302,"[https://aws.amazon.com/solutions/case-studies/jaspersoft/?sc_channel=sm&sc_campaign=gc_dw&sc_publisher=tw_go&sc_content=jaspersoft&sc_country=global&sc_geo=global&sc_category=data_warehouse&trk=global_2016_data_warehouse&adbsc=gc_20160402_59959296&adbid=716285623842123776&adbpl=tw&adbpr=66780587, https://]"
99996,tw-716349615340199937,Saturday,13.0,2.0,en,0.0,763.0,0.0,0.0,29.0,0.0,I added a video to a @YouTube playlist Module 3 : AWS EC2 Instance Status Checks (System and Instance level),1332.0,tw-1143330170,[https://www.youtube.com/watch?v=YPf7wSEq9d0&feature=youtu.be&a]
99997,tw-716351557516722176,Saturday,13.0,2.0,en,1.0,122.0,435.0,0.0,17.0,1.0,RT @awscloud: Test your iOS Android &amp; web apps against real phones and tablets on AWS. Start today for free!,1336.0,tw-139346924,[https://cards.twitter.com/cards/13rc8b/1ihbc]
99998,tw-716352501860196352,Saturday,13.0,2.0,en,1.0,287.0,30.0,0.0,42.0,0.0,RT @goserverless: Serverless Offline: Emulate AWS Lambda &amp; API Gateway locally to speed up your development cycles -,3795.0,tw-21826302,[https://t.co/CcXASlhQq]


# Tokenizing

In [None]:
# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus is not
# installed, and the import cell only downloads 'punkt', so fetch it here
# (a no-op when the corpus is already up to date).
nltk.download('stopwords')

# English stopwords to filter out of the tokenized tweets. The commented-out
# pieces below would additionally treat punctuation and quote/ellipsis tokens
# as stopwords; they are currently disabled.
stopwords_list = stopwords.words('english') #+ list(string.punctuation)
# stopwords_list += ["''", '""', '...', '``']

In [None]:
def process_things(things):
    """Tokenize a piece of text and return its lowercased non-stopword tokens.

    The text is split with nltk.word_tokenize. A token is kept only when its
    lowercase form is not in the module-level `stopwords_list`; kept tokens
    are returned lowercased, in their original order.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(things):
        lowered = raw_token.lower()
        if lowered not in stopwords_list:
            kept_tokens.append(lowered)
    return kept_tokens

In [None]:
# Tokenize every tweet and drop stopwords (one token list per tweet). Only
# the first few results are displayed; echoing the whole list would flood the
# cell output with one entry per tweet.
data = [process_things(tweet) for tweet in df['text']]
data[:5]