# Eye of the Emergency

## Libraries and dataset import

In [25]:
import pandas as pd

# 

import nltk

# Fetch the NLTK resources used by the text-cleaning step.
# 'punkt_tab' is what recent NLTK releases load for `word_tokenize`; 'punkt' is
# kept for older releases. 'stopwords', 'wordnet' and 'omw-1.4' back the
# stop-word filter and the WordNet lemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import string
import re

from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the train and test tweet CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_tweets.csv', 'test_tweets.csv')
)

## Data Preprocessing

In [3]:
# Count how many times each distinct tweet text occurs in the training set.
# `Series.value_counts` replaces the deprecated top-level `pd.value_counts`;
# `rename_axis` + `reset_index(name=...)` yields the columns ['text', 'count']
# regardless of how the installed pandas version names the result.
unique_count = (
    train_df['text']
    .value_counts()
    .rename_axis('text')
    .reset_index(name='count')
)

In [4]:
#unique_count.head()

In [5]:
# One row per distinct tweet text, in order of first appearance.
train_df_unique = pd.DataFrame({'text': train_df['text'].unique()})
#train_df_unique.head()

In [6]:
# Attach the occurrence count to each distinct text (joins on the shared columns).
# NOTE(review): this rebinds its own input, so re-running the cell joins on a
# different set of shared columns than the first run.
train_df_unique = pd.merge(train_df_unique, unique_count)
#train_df_unique.head()

In [7]:
# Add the per-text occurrence count to every training row (joins on the shared columns).
train_df_count = pd.merge(train_df, unique_count)

In [8]:
#train_df_count.head()

In [9]:
# Character length of each training tweet, as a one-column frame.
lg = pd.DataFrame({'len_text': list(map(len, train_df['text']))})
#lg.head()

In [10]:
# Place the length column next to the counted frame (aligned on the row index).
train_df_len = pd.concat((train_df_count, lg), axis='columns')
#train_df_len.head()

### Location Encoder

In [11]:
# Binary flag per row: 1 when the location value is a string, 0 otherwise.
location_flags = [1 if type(loc) == str else 0 for loc in train_df_len['location']]
encode_location = pd.DataFrame({'location_enc': location_flags})

In [12]:
# Append the location flag as a new column, then preview the first rows.
train_df_location = pd.concat((train_df_len, encode_location), axis='columns')
train_df_location.head()

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0


## Data Visualization

In [13]:
# Read the custom stop-word file (no header row) into a one-column frame.
# NOTE(review): this rebinds `stopwords`, which the import cell bound to
# `nltk.corpus.stopwords`; from here on the name no longer refers to the NLTK
# corpus reader.
stopwords = pd.read_csv('stopwords.txt', header = None)
stopwords.columns=['words']

In [14]:
# Turn the one-column frame into a plain Python list of stop words.
stopwords = list(stopwords['words'])

In [15]:
# Extra tokens appended to the custom stop-word list.
stopwords.extend(['http', 'https', 'Û_', 'amp'])

In [16]:
# All tweet texts as a plain list.
text = list(train_df_location['text'])
#text

In [17]:
# Concatenate every tweet into one space-separated string.
my_lst_str = ' '.join(str(item) for item in text)
#my_lst_str

In [18]:
# Tweet texts split by target label (0 and 1) as plain lists.
text_0 = list(train_df_location.loc[train_df_location['target'] == 0, 'text'])
text_1 = list(train_df_location.loc[train_df_location['target'] == 1, 'text'])

In [19]:
# One space-separated string per target label.
my_lst_str_0 = ' '.join(str(item) for item in text_0)
my_lst_str_1 = ' '.join(str(item) for item in text_1)

In [20]:
#my_lst_str_0
#my_lst_str_1

## NLTK

In [21]:
# Independent working copy of the prepared frame (source of the raw text).
df = train_df_location.copy(deep=True)

In [22]:
# Second independent copy; receives the processed-text column further down.
df2 = train_df_location.copy(deep=True)

In [23]:
# text cleaning

def text_processing(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, tokenize it with NLTK's
    ``word_tokenize``, drop punctuation tokens, drop English stop words,
    lemmatize the remaining tokens with ``WordNetLemmatizer`` and join them
    with single spaces.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned tokens joined by one space (an empty string when no token
        survives the filters).
    """
    # Import the corpus reader under a private alias: an earlier cell rebinds
    # the module-level name `stopwords` to a plain list, so calling
    # `stopwords.words('english')` here raises AttributeError when the
    # notebook is run top to bottom on a fresh kernel.
    from nltk.corpus import stopwords as nltk_stopwords

    # Load the English stop words
    stop_words = set(nltk_stopwords.words('english'))

    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())

    # Remove punctuation. This is a substring test against
    # `string.punctuation`, so single punctuation marks are dropped while
    # multi-character tokens such as '...' are kept.
    tokens = [word for word in tokens if word not in string.punctuation]

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize every remaining token
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    tokens = ' '.join(tokens)
    return tokens
    

# Charger l'ensemble de données
#df = pd.read_csv('votre_fichier.csv')

In [26]:
# Run text_processing on every value of the "text" column and store the result
# in a new "text_process" column.
df2['text_process'] = [text_processing(raw_text) for raw_text in df['text']]

In [27]:
# Preview the first rows, including the new text_process column.
df2.head()

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc,text_process
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0,resident asked 'shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0,got sent photo ruby alaska smoke wildfire pour...


In [28]:
# Processed text of the row labelled 0.
df2.loc[0, 'text_process']

'deed reason earthquake may allah forgive u'

## ML: SVM with scikit-learn

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df2['text_process'],
    df2['target'],
    random_state=42,
    test_size=0.2,
)


In [30]:
# Bag-of-words features: the vocabulary is learned on the training split only,
# then reused unchanged to encode the held-out split.
vectorizer = CountVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [31]:
# Instantiate the classifier: support-vector machine with a linear kernel.
svc_options = {'kernel': 'linear', 'gamma': 'scale', 'shrinking': False}
model_SVC = SVC(**svc_options)

In [32]:
# Train the classifier on the vectorized training split.
model_SVC.fit(X_train_vec, y_train)

SVC(kernel='linear', shrinking=False)

In [33]:
# Compute the model's score on the held-out split
# (for scikit-learn classifiers, `score` reports mean accuracy).
model_SVC.score(X_test_vec, y_test)

0.7905449770190414

In [34]:
# Prediction on a single hand-written example.
pred = pd.Series(["Help me, my house is on fire and all the forest is burning"])
# Apply the same cleaning used for the training texts so the input matches what
# the vectorizer was fitted on, and store the features under their own name so
# the held-out matrix X_test_vec is not overwritten (the score cell stays
# re-runnable).
pred_vec = vectorizer.transform(pred.apply(text_processing))
prediction = model_SVC.predict(pred_vec)

In [35]:
# Build the result message: class 0 -> "NO DISASTER", class 1 -> "DISASTER";
# any other value leaves the prefix without a label.
resultat = "Résultat : " + {0: "NO DISASTER", 1: "DISASTER"}.get(prediction[0], "")

In [36]:
# Show the formatted prediction message as the cell output.
resultat

'Résultat : DISASTER'

In [37]:
# Prediction on a second hand-written example.
pred = pd.Series(["Hello"])
# Apply the same cleaning used for the training texts so the input matches what
# the vectorizer was fitted on, and store the features under their own name so
# the held-out matrix X_test_vec is not overwritten (the score cell stays
# re-runnable).
pred_vec = vectorizer.transform(pred.apply(text_processing))
prediction = model_SVC.predict(pred_vec)

In [38]:
# Build the result message: class 0 -> "NO DISASTER", class 1 -> "DISASTER";
# any other value leaves the prefix without a label.
resultat = "Résultat : " + {0: "NO DISASTER", 1: "DISASTER"}.get(prediction[0], "")

In [39]:
# Show the formatted prediction message as the cell output.
resultat

'Résultat : NO DISASTER'