# import needed libraries

In [1]:

import os
import requests
import pandas
import time
import regex

# Import Gensim.
import gensim

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier



#turn off the really annoying deprecation messages
import warnings
warnings.filterwarnings('ignore')



#VERY useful to be able to single-step code and check values in medias res (i.e. mid-execution)
from IPython.core.debugger import set_trace

# Import twitter data to a pandas dataframe

In [2]:
# Global path/filename components used by the load and save cells below.

# Directory that receives the cleaned CSV exports.
output_data_dir = './final_data_sets/'

# Training data: input directory, input file name, cleaned output file name.
train_input_path, train_input_data_filename, train_output_data_filename = (
    './data_collection_twitter/',
    '2012_Sandy_Hurricane-ontopic_offtopic.csv',
    'cleaned_tweet_train_data.csv',
)

# Test data: input directory, input file name, cleaned output file name.
test_input_path, test_input_data_filename, test_output_data_filename = (
    './data_collection_twitter/',
    'tweets_results.csv',
    'cleaned_tweet_test_data.csv',
)

In [3]:
# Load the training CSV and preview the first rows.
# No empty-DataFrame placeholder is created first: if read_csv fails, the name
# stays unbound and later cells fail loudly instead of running on an empty frame.
train_twitter_dataframe = pandas.read_csv(
    train_input_path + train_input_data_filename
)
train_twitter_dataframe.head()

Unnamed: 0,tweet id,tweet,label
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


In [4]:
# (rows, columns) of the training frame as loaded.
train_twitter_dataframe.shape

(10008, 3)

In [5]:
# Column names, non-null counts and dtypes of the training frame.
train_twitter_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10008 entries, 0 to 10007
Data columns (total 3 columns):
tweet id    10008 non-null object
 tweet      10008 non-null object
 label      10008 non-null object
dtypes: object(3)
memory usage: 234.6+ KB


# Process columns/rows
 - Keep the following columns
    - ' tweet' 
    - ' label'
    
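 - Drop the following columns
 	- 'tweet id'
 	- 'Information Source' (not present in this file, so there is nothing to drop)
 	- 'Information Type' (not present in this file, so there is nothing to drop)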


In [6]:
# Drop the 'tweet id' column.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run safely: a second execution would otherwise raise KeyError
# because the column is already gone. The redundant axis=1 is removed, since
# columns= already names the axis.
train_twitter_dataframe = train_twitter_dataframe.drop(columns=['tweet id'],
                                                       errors='ignore')

In [7]:
# Rename the remaining columns. The keys keep the leading space that the
# loaded column names carry (' tweet', ' label').
train_twitter_dataframe.rename(
    columns={
        ' tweet': 'Tweet Text',
        ' label': 'Informativeness',
    },
    inplace=True,
)

In [8]:
# Re-inspect column names, non-null counts and dtypes after the drop/rename.
train_twitter_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10008 entries, 0 to 10007
Data columns (total 2 columns):
Tweet Text         10008 non-null object
Informativeness    10008 non-null object
dtypes: object(2)
memory usage: 156.5+ KB


In [9]:
# Preview the first rows of the training frame.
train_twitter_dataframe.head()

Unnamed: 0,Tweet Text,Informativeness
0,I've got enough candles to supply a Mexican fa...,off-topic
1,Sandy be soooo mad that she be shattering our ...,on-topic
2,@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,@taos you never got that magnificent case of B...,off-topic
4,"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


In [10]:
# Count missing values per column.
train_twitter_dataframe.isna().sum()

Tweet Text         0
Informativeness    0
dtype: int64

In [11]:
# Write the cleaned training frame to CSV.
# to_csv does not create missing directories, so ensure the output directory
# exists first (no-op when it is already there).
os.makedirs(output_data_dir, exist_ok=True)
# The DataFrame index is written as the first, unnamed column (to_csv default).
train_twitter_dataframe.to_csv(output_data_dir+train_output_data_filename)

In [12]:
# Load the test CSV and preview the first rows.
# No empty-DataFrame placeholder is created first: if read_csv fails, the name
# stays unbound and later cells fail loudly instead of running on an empty frame.
test_twitter_dataframe = pandas.read_csv(
    test_input_path + test_input_data_filename
)
test_twitter_dataframe.head()

Unnamed: 0,id,created_at,text,user,user_id,user_name,user_location,user_description,user_followers,retweet_count,favorite_count,lang,is_quote_status,place,place_name,place_country,coordinates,coordinates_longitude,coordinates_latitude
0,260244087901413376,Mon Oct 22 05:00:00 +0000 2012,I suppose she has an appropriate costume for e...,"{'profile_sidebar_fill_color': 'DDEEF6', 'is_t...",24753438,William C. Statham,"New York, NY",🎭Social Media Influencer🎭 Facebook: william.co...,726,0,0,en,False,"{'full_name': 'West Long Branch, NJ', 'country...","West Long Branch, NJ",US,"[-74.037008, 40.272289]",-74.037008,40.272289
1,260244088161439744,Mon Oct 22 05:00:00 +0000 2012,@NOT_savinHOES Not r yu upp,"{'profile_sidebar_fill_color': 'DDEEF6', 'is_t...",401231570,Jay 🤷🏽‍♂️,"Washington, DC, USA",кιиg ʝαмєѕ🎩🏆 25. D[M]V | 👻:OfficialJaymes 📸Ins...,815,0,0,en,False,"{'full_name': 'Bressler-Enhaut-Oberlin, PA', '...","Bressler-Enhaut-Oberlin, PA",US,"[-76.831479, 40.22417]",-76.831479,40.22417
2,260244088819945472,Mon Oct 22 05:00:00 +0000 2012,Hit and Run is so sad..,"{'profile_sidebar_fill_color': 'DDEEF6', 'is_t...",123368790,Cheree Mercedez💯♥️,"Fayetteville, NC",,696,0,0,en,False,"{'full_name': 'South Carolina, USA', 'country_...","South Carolina, USA",US,"[-83.353955, 32.04683]",-83.353955,32.04683
3,260244089080004609,Mon Oct 22 05:00:00 +0000 2012,Who's up?,"{'profile_sidebar_fill_color': 'A0C5C7', 'is_t...",47812293,TheLitRoom,World Wide,"Entertainment news (Music, Sports, Culture, ec...",1368,0,0,en,False,"{'full_name': 'Malden, MA', 'country_code': 'U...","Malden, MA",US,"[-71.089522, 42.412466]",-71.089522,42.412466
4,260244089985957888,Mon Oct 22 05:00:00 +0000 2012,@augustushazel idk I'm just ugly or annoying o...,"{'profile_sidebar_fill_color': 'EFEFEF', 'is_t...",274750107,moon boi,"Erie, PA",,249,0,0,en,False,"{'full_name': 'Erie, PA', 'country_code': 'US'...","Erie, PA",US,"[-80.239991, 42.018414]",-80.239991,42.018414


In [13]:
# (rows, columns) of the test frame as loaded.
test_twitter_dataframe.shape

(102255, 19)

# Process columns/rows
 - Keep the following columns
    - 'created_at'
    - 'text'
    - 'retweet_count'
    - 'lang'
    - 'place_name'
    - 'coordinates_longitude'
    - 'coordinates_latitude'
    
 - Drop the following columns
 	- id
 	- user
 	- user_id
 	- user_name
 	- user_description
 	- user_followers
 	- user_location
 	- favorite_count
 	- is_quote_status
 	- place
 	- place_country
 	- coordinates

In [14]:
# Remove the columns that are not kept in the cleaned test set.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run safely: a second execution would otherwise raise KeyError
# because the columns are already gone.
test_twitter_dataframe = test_twitter_dataframe.drop(
    columns=[
        'id',
        'user',
        'user_id',
        'user_name',
        'user_description',
        'user_followers',
        'user_location',
        'favorite_count',
        'is_quote_status',
        'place',
        'place_country',
        'coordinates',
    ],
    errors='ignore',
)
test_twitter_dataframe.head()

Unnamed: 0,created_at,text,retweet_count,lang,place_name,coordinates_longitude,coordinates_latitude
0,Mon Oct 22 05:00:00 +0000 2012,I suppose she has an appropriate costume for e...,0,en,"West Long Branch, NJ",-74.037008,40.272289
1,Mon Oct 22 05:00:00 +0000 2012,@NOT_savinHOES Not r yu upp,0,en,"Bressler-Enhaut-Oberlin, PA",-76.831479,40.22417
2,Mon Oct 22 05:00:00 +0000 2012,Hit and Run is so sad..,0,en,"South Carolina, USA",-83.353955,32.04683
3,Mon Oct 22 05:00:00 +0000 2012,Who's up?,0,en,"Malden, MA",-71.089522,42.412466
4,Mon Oct 22 05:00:00 +0000 2012,@augustushazel idk I'm just ugly or annoying o...,0,en,"Erie, PA",-80.239991,42.018414


In [15]:
# Column names, non-null counts and dtypes of the reduced test frame.
test_twitter_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102255 entries, 0 to 102254
Data columns (total 7 columns):
created_at               102255 non-null object
text                     102254 non-null object
retweet_count            102255 non-null int64
lang                     102255 non-null object
place_name               102255 non-null object
coordinates_longitude    102255 non-null object
coordinates_latitude     102255 non-null object
dtypes: int64(1), object(6)
memory usage: 5.5+ MB


In [16]:
# Count missing values per column.
test_twitter_dataframe.isna().sum()

created_at               0
text                     1
retweet_count            0
lang                     0
place_name               0
coordinates_longitude    0
coordinates_latitude     0
dtype: int64

In [17]:
# Discard every row that has a missing value in any column.
test_twitter_dataframe = test_twitter_dataframe.dropna(axis=0, how='any')

In [18]:
# Re-count missing values per column after dropna().
test_twitter_dataframe.isna().sum()

created_at               0
text                     0
retweet_count            0
lang                     0
place_name               0
coordinates_longitude    0
coordinates_latitude     0
dtype: int64

In [19]:
# (rows, columns) of the test frame after dropna().
test_twitter_dataframe.shape

(102254, 7)

In [20]:
# Write the cleaned test frame to CSV.
# to_csv does not create missing directories, so ensure the output directory
# exists first (no-op when it is already there).
os.makedirs(output_data_dir, exist_ok=True)
# The DataFrame index is written as the first, unnamed column (to_csv default).
test_twitter_dataframe.to_csv(output_data_dir+test_output_data_filename)

# import word2vec data

In [24]:
# ### Start timer.
# t0 = time.time()

# ### Import word vectors into "model."
# model = gensim.models.KeyedVectors.load_word2vec_format('./WordVec_dictionary/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors')

# ### Print results of timer.
# print(time.time() - t0)

# identify words to check out

In [25]:
# disaster_words = model.most_similar(["hurricane", "tornado", "flood", "earthquake"], topn=30)