# Eye of the Emergency

## Libraries and dataset import

In [1]:
# Data import & Exploring
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt

# 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import string
import re

from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
train_df = pd.read_csv('train_tweets.csv')
test_df = pd.read_csv('test_tweets.csv')

## Data Preprocess

In [3]:
unique_count = pd.DataFrame(pd.value_counts(train_df.text))
unique_count = unique_count.reset_index()
unique_count.columns = ['text', 'count']

In [4]:
#unique_count.head()

In [5]:
train_df_unique = pd.DataFrame(train_df.text.unique(), columns = ['text'])
#train_df_unique.head()

In [6]:
train_df_unique = train_df_unique.merge(unique_count)
#train_df_unique.head()

In [7]:
train_df_count = train_df.merge(unique_count)

In [8]:
#train_df_count.head()

In [9]:
lg = pd.DataFrame([len(txt) for txt in train_df.text], columns = ['len_text'])
#lg.head()

In [10]:
train_df_len = pd.concat([train_df_count, lg], axis = 1)
#train_df_len.head()

### Location Encoder

In [11]:
encode_location = pd.DataFrame([int(type(ct)==str) for ct in train_df_len.location], columns=['location_enc'])

In [12]:
train_df_location = pd.concat([train_df_len, encode_location], axis=1)
train_df_location.head()

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0


## NLTK

In [13]:
df = train_df_location.copy()

In [14]:
df2 = train_df_location.copy()

In [15]:
# text cleaning

def text_processing(text):
     #Charger les stop-words en anglais
    stop_words = set(stopwords.words('english'))

    # Initialiser le lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Appliquer la tokenisation à tous les textes
    tokens = word_tokenize(text.lower())

    # Supprimer les ponctuations
    tokens = [word for word in tokens if word not in string.punctuation]
    

    # Supprimer les stop-words
    tokens = [word for word in tokens if word not in stop_words]
    

    # Appliquer la lemmatisation à tous les tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    #tokens = ' '.join(tokens)
    return tokens
    

# Charger l'ensemble de données
#df = pd.read_csv('votre_fichier.csv')

In [16]:
# Appliquer la fonction Cleanup à tous les textes dans la colonne "text"
df2['text_process'] = df['text'].apply(text_processing)

In [17]:
df2.head()

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc,text_process
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0,"[deed, reason, earthquake, may, allah, forgive..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0,"[resident, asked, 'shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0,"[13,000, people, receive, wildfire, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [18]:
df2.text_process[0]

['deed', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'u']

## ML : Dataset Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df2['text_process'], df2['target'], test_size=0.2, random_state=42)


## ML : Word Embedding - SKLEARN

In [20]:
#vectorizer = CountVectorizer()
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

In [21]:
tfidf = TfidfVectorizer()

In [22]:
X_train

4996    [stfxuniversity, people, died, human, experime...
3263    [engulfed, low, self-image, take, quiz, http, ...
4907    [cia, hey, guy, 's, stopped, massacre, send, c...
2855    [elem, pomo, helping, displaced, rocky, fire, ...
4716    [morning_joe, reince, presssec, joe, ur, smart...
                              ...                        
5226    [auntiedote, rioslade, locke_wiggins, akarb74,...
5390    [dream, magic, linden, method, lite, version, ...
860     [omron, hem-712c, automatic, blood, pressure, ...
7603    [father-of-three, lost, control, car, overtaki...
7270    [pawsox, owner, public, return, whirlwind, tri...
Name: text_process, Length: 6090, dtype: object

In [23]:
X_train_vec = tfidf.fit_transform(df2['text'].to_numpy())
pd.DataFrame(X_train_vec).head()

Unnamed: 0,0
0,"(0, 1851)\t0.20827049400579561\n (0, 19774)..."
1,"(0, 3843)\t0.385143605810349\n (0, 16611)\t..."
2,"(0, 7014)\t0.21673027088707333\n (0, 13936)..."
3,"(0, 3797)\t0.25856984887413254\n (0, 20607)..."
4,"(0, 16700)\t0.23312651658438782\n (0, 9947)..."


In [24]:
X_test_vec = tfidf.transform(X_test)
X_test_vec

AttributeError: 'list' object has no attribute 'lower'

In [None]:
feature_names = tfidf.get_feature_names()
#feature_names

In [None]:
#for col in X_train_vec.nonzero()[1]:
 #   print (feature_names[col], ' - ', X_train_vec[0, col])

## ML : SVM avec sklearn

In [None]:
#instanciation
model_SVC = SVC() #kernel = 'linear', gamma = 'scale', shrinking = False)

In [None]:
#training
model_SVC.fit(X_train_vec, y_train)

In [None]:
#calcul de la précision
model_SVC.score(X_test_vec, y_test)

In [None]:
#Prédiction
pred = pd.Series(["Help me, my house is on fire and all the forest is burning"])
X_test_vec = tfidf.transform(pred)
prediction = model_SVC.predict(X_test_vec)

In [None]:
#affichage des résultats
resultat = "Résultat : "
if prediction[0] == 0:
    resultat = resultat + "NO DISASTER"
if prediction[0] == 1:
    resultat = resultat + "DISASTER"

In [None]:
resultat

In [None]:
#Prédiction
pred = pd.Series(["Hello"])
X_test_vec = tfidf.transform(pred)
prediction = model_SVC.predict(X_test_vec)

In [None]:
#affichage des résultats
resultat = "Résultat : "
if prediction[0] == 0:
    resultat = resultat + "NO DISASTER"
if prediction[0] == 1:
    resultat = resultat + "DISASTER"

In [None]:
resultat

## ML : xgboost

The data is stored in a DMatrix object.

In [None]:
X_train

In [None]:
pd.DataFrame(X_train).head()

In [None]:
pd.DataFrame(X_train_vec).head()

In [None]:
type(X_train_vec)

In [None]:
# Create regression matrices
dtrain = xgb.DMatrix(X_train_vec)

In [None]:
# Create regression matrices
dtrain = xgb.DMatrix(X_train_vec, y_train)#, enable_categorical=True)
dtest = xgb.DMatrix(X_test_vec, y_test) #, enable_categorical=True)
np.asarray(dtest)

In [None]:
# Define hyperparameters
param = {'objective': 'binary:logistic', 'eval_metric': 'auc'}

In [None]:
# Define hyperparameters
model_xgb = xgb.train(param, dtrain, num_boost_round=10)

In [None]:
ypred = model_xgb.predict(dtest)
ypred

In [None]:
#Prédiction
pred = pd.Series(["Hello"])
X_test_vec = tfidf.transform(pred)
dpred = xgb.DMatrix(X_test_vec)

In [None]:
prediction = model_xgb.predict(dpred)
prediction

In [None]:
#accuracy = sum(prediction == y_test) / len(y_test)
#print('Accuracy: ', accuracy)

### API SKL

In [None]:
model_xgb_skl = xgb.XGBClassifier()

## ML : SKL Log Reg

In [None]:
X_train_vec_arr = np.asarray(X_train_vec.toarray())

In [None]:
X_train_vec_arr

In [None]:
X_train_vec_arr_df = pd.DataFrame(X_train_vec_arr)

In [None]:
X_train_vec_arr_df.describe()

In [None]:
model_logreg = sm.Logit(y_train, X_train_vec)

In [None]:
result = model_logreg.fit()

In [None]:
result.summary2()

In [None]:
logreg = LogisticRegression()

logreg.fit(X_train_vec, y_train)

y_pred = logreg.predict(X_test_vec)

In [None]:
print('Model accuracy (%) : {:.1f}'.format(logreg.score(X_test_vec, y_test) * 100))

In [None]:
roc_auc = roc_auc_score(y_test, logreg.predict(X_test_vec))

fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test_vec)[:,1])

In [None]:
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()