## Real or Not? NLP with Disaster Tweets - Project 2 - Sam's part

Big-Scale Analytics - Project 2 - Team Rolex
> Samuel Lew, Alexandre Lang, Samy Bouzerda, Alix Muller

## Load dataset

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv ('../data/train.csv')
df = df[['id', 'text', 'target']]
df.iloc[:10]

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
5,8,#RockyFire Update => California Hwy. 20 closed...,1
6,10,#flood #disaster Heavy rain causes flash flood...,1
7,13,I'm on top of the hill and I can see a fire in...,1
8,14,There's an emergency evacuation happening now ...,1
9,15,I'm afraid that the tornado is coming to our a...,1


In [4]:
index = df.index
columns = df.columns
values = df.values

In [5]:
df_yes = df.query('target == 1')
yes_count = 0
for row in df_yes.iterrows():
    yes_count += 1
print(f'there are {yes_count} tweets about disasters')

df_no = df.query('target == 0')
no_count = 0
for row in df_no.iterrows():
    no_count += 1
print(f'there are {no_count} tweets that are not about disasters')

there are 3271 tweets about disasters
there are 4342 tweets that are not about disasters


## Process text

In [6]:
#!pip install spacy
#!python -m spacy download en

In [7]:
nlp = English()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

### Removing stopwords & lowercasing every word:

In [15]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

punctuations = string.punctuation

nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

parser = English()

filtered_tweets = []
for tweet in df['text']:
    tweet = nlp(tweet)
    filt_tweet = ' '
    for word in tweet:
        if word.is_stop == False and word.lemma_ != "-PRON-" and word not in stop_words or punctuations:
            filt_tweet = filt_tweet + ' ' + str(word)
          
    filtered_tweets.append(filt_tweet)
    
df['filtered_tweet'] = filtered_tweets

def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

df['filtered_tweet'] = df['filtered_tweet'].map(clean_text)

df.head(3)

Unnamed: 0,id,text,target,filtered_tweet
0,1,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this # earthquake ...
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask . canada
2,5,All residents asked to 'shelter in place' are ...,1,all residents asked to ' shelter in place ' ar...


In [9]:


for row in df:
  
  for word in ['filtered_tweet']:
    print(word.lemma)

[ word.lemma_.lower().strip() 
                 else word.lower_ for word in df['filtered_tweet'] ]
[ word for word in df['filtered_tweet']
                if word not in stop_words and word not in punctuations ]

AttributeError: 'str' object has no attribute 'lemma_'

### Logistic regression:

In [40]:
#!python -m spacy download en_core_web_sm

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline


In [41]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

punctuations = string.punctuation

nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() 
                if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens 
                if word not in stop_words and word not in punctuations ]

    return mytokens


tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [49]:
from sklearn.model_selection import train_test_split

X = df['text']
ylabels = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=72)

In [50]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver="lbfgs")

# Create pipeline using Bag of Words
pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

from sklearn import metrics
predicted = pipe.predict(X_test)

print(" test Accuracy:",metrics.accuracy_score(y_test, predicted))
print(" Precision:",metrics.precision_score(y_test, predicted, average=None))
print(" Recall:",metrics.recall_score(y_test, predicted, average=None))

 test Accuracy: 0.8047285464098074
 Precision: [0.78066667 0.85076531]
 Recall: [0.90916149 0.66967871]


#### we can see here that our model identified a tweet about a disaster 80.47% of the time
#### when it predicted a tweet to be about a disaster, it was correctly assessed 78.07% of the time, and was correctly assessed for a tweet not about a disaster 85.08% of the time
#### when given a tweet about a disaster, the model considered it as being about a disaster 90.92% of the time, and when given a tweet not about a disaster the model considered it as not a disaster for 66.97% of the time