# Eye of the Emergency

## Libraries and dataset import

In [1]:
# Data import & Exploring
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt

# 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import string
import re

from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\utile\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
train_df = pd.read_csv('train_tweets.csv')
test_df = pd.read_csv('test_tweets.csv')

## Data Preprocess

In [3]:
unique_count = pd.DataFrame(pd.value_counts(train_df.text))
unique_count = unique_count.reset_index()
unique_count.columns = ['text', 'count']

In [4]:
#unique_count.head()

In [5]:
train_df_unique = pd.DataFrame(train_df.text.unique(), columns = ['text'])

In [6]:
#train_df_unique.head()

In [7]:
train_df_unique = train_df_unique.merge(unique_count)

In [8]:
#train_df_unique.head()

In [9]:
train_df_count = train_df.merge(unique_count)

In [10]:
#train_df_count.head()

In [11]:
lg = pd.DataFrame([len(txt) for txt in train_df.text], columns = ['len_text'])

In [12]:
#lg.head()

In [13]:
train_df_len = pd.concat([train_df_count, lg], axis = 1)

In [14]:
#train_df_len.head()

### Location Encoder

In [15]:
encode_location = pd.DataFrame([int(type(ct)==str) for ct in train_df_len.location], columns=['location_enc'])

In [16]:
#encode_location.head()

In [17]:
train_df_location = pd.concat([train_df_len, encode_location], axis=1)

In [18]:
train_df_location.head()

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0


## NLTK

In [19]:
df = train_df_location.copy()

In [20]:
df2 = train_df_location.copy()

In [21]:
# text cleaning

def text_processing(text):
     #Charger les stop-words en anglais
    stop_words = set(stopwords.words('english'))

    # Initialiser le lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Appliquer la tokenisation à tous les textes
    tokens = word_tokenize(text.lower())

    # Supprimer les ponctuations
    tokens = [word for word in tokens if word not in string.punctuation]
    

    # Supprimer les stop-words
    tokens = [word for word in tokens if word not in stop_words]
    
    # Supprimer les stop-words BIS
    tokens = [word for word in tokens if word != "http"]
    tokens = [word for word in tokens if word[0:7] != "//t.co/"]
    

    # Appliquer la lemmatisation à tous les tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    tokens = ' '.join(tokens)
    
    return tokens
    

# Charger l'ensemble de données
#df = pd.read_csv('votre_fichier.csv')

In [22]:
# Appliquer la fonction Cleanup à tous les textes dans la colonne "text"
df2['text_process'] = df['text'].apply(text_processing)

In [23]:
df2

Unnamed: 0,id,keyword,location,text,target,count,len_text,location_enc,text_process
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,69,0,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,38,0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,1,133,0,resident asked 'shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,65,0,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,88,0,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...,...,...,...
7608,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1,1,83,0,worldnews fallen powerlines g link tram update...
7609,10864,,,on the flip side I'm at Walmart and there is a...,1,1,125,0,flip side 'm walmart bomb everyone evacuate st...
7610,10866,,,Suicide bomber kills 15 in Saudi security site...,1,1,65,0,suicide bomber kill 15 saudi security site mos...
7611,10869,,,Two giant cranes holding a bridge collapse int...,1,1,137,0,two giant crane holding bridge collapse nearby...


In [24]:
df2.text_process[7600]



## ML : Word Embedding - SKLEARN

In [26]:
X = df2['text_process']

In [27]:
X

0              deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       resident asked 'shelter place notified officer...
3       13,000 people receive wildfire evacuation orde...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608    worldnews fallen powerlines g link tram update...
7609    flip side 'm walmart bomb everyone evacuate st...
7610    suicide bomber kill 15 saudi security site mos...
7611    two giant crane holding bridge collapse nearby...
7612    latest home razed northern california wildfire...
Name: text_process, Length: 7613, dtype: object

In [28]:
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

In [30]:
pd.DataFrame(X_vec)

Unnamed: 0,0
0,"(0, 3971)\t1\n (0, 11465)\t1\n (0, 4666)\t..."
1,"(0, 5654)\t1\n (0, 5482)\t1\n (0, 9653)\t1..."
2,"(0, 11708)\t1\n (0, 1394)\t1\n (0, 12577)\..."
3,"(0, 5075)\t1\n (0, 10223)\t1\n (0, 89)\t1\..."
4,"(0, 15335)\t1\n (0, 6165)\t1\n (0, 12442)\..."
...,...
7608,"(0, 5482)\t1\n (0, 14731)\t1\n (0, 5276)\t..."
7609,"(0, 12660)\t1\n (0, 5099)\t1\n (0, 13271)\..."
7610,"(0, 9721)\t1\n (0, 12749)\t1\n (0, 105)\t1..."
7611,"(0, 14558)\t1\n (0, 6756)\t1\n (0, 3223)\t..."


In [32]:
tfidf = TfidfVectorizer()

In [33]:
#X_train_vec = tfidf.fit_transform(df2['text'].to_numpy())

In [34]:
#pd.DataFrame(X_train_vec).head()

In [35]:
X_vec = tfidf.fit_transform(X)

In [36]:
X_vec

<7613x15840 sparse matrix of type '<class 'numpy.float64'>'
	with 70097 stored elements in Compressed Sparse Row format>

In [37]:
pd.DataFrame(X_vec)

Unnamed: 0,0
0,"(0, 5662)\t0.4916801595530426\n (0, 995)\t0..."
1,"(0, 2618)\t0.38768584886076835\n (0, 12227)..."
2,"(0, 5166)\t0.2516652226079931\n (0, 10223)\..."
3,"(0, 2579)\t0.2760495231763834\n (0, 15335)\..."
4,"(0, 12297)\t0.27148235929883796\n (0, 10910..."
...,...
7608,"(0, 14300)\t0.3537612084336819\n (0, 5073)\..."
7609,"(0, 15083)\t0.3858687397178091\n (0, 14501)..."
7610,"(0, 2187)\t0.2541182175049369\n (0, 15081)\..."
7611,"(0, 9654)\t0.37054956786543136\n (0, 6737)\..."


In [None]:
feature_names = tfidf.get_feature_names()

In [None]:
feature_names

In [None]:
#for col in X_train_vec.nonzero()[1]:
 #   print (feature_names[col], ' - ', X_train_vec[0, col])

## ML : Dataset Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, df2['target'], test_size=0.2, random_state=42)


## ML : SVM avec sklearn

In [None]:
#instanciation
model_SVC = SVC() #kernel = 'linear', gamma = 'scale', shrinking = False)

In [None]:
#training
model_SVC.fit(X_train, y_train)

In [None]:
#calcul de la précision
model_SVC.score(X_test, y_test)

In [None]:
def fonction_SVM(tweet):
    tw = []
    tw.append(tweet)
    #Prédiction
    X_1 = pd.Series([tweet])
    X_1 = tfidf.transform(X_1)
    pred = model_SVC.predict(X_1)

    #affichage des résultats
    resultat = "Résultat : "
    if pred[0] == 0:
        resultat = resultat + "NO DISASTER"
    if pred[0] == 1:
        resultat = resultat + "DISASTER"
        
    return resultat

In [None]:
#Prédiction
fonction_SVM("Help me, my house is on fire and all the forest is burning")

In [None]:
#Prédiction
fonction_SVM("forest")

## ML : xgboost

The data is stored in a DMatrix object.

In [None]:
X_train

In [None]:
pd.DataFrame(X_train).head()

In [None]:
type(X_train)

In [None]:
# Create regression matrices
dtrain = xgb.DMatrix(X_train)

In [None]:
# Create regression matrices
dtrain = xgb.DMatrix(X_train, y_train)#, enable_categorical=True)
dtest = xgb.DMatrix(X_test, y_test) #, enable_categorical=True)
np.asarray(dtest)

In [None]:
# Define hyperparameters
param = {'objective': 'binary:logistic', 'eval_metric': 'auc'}

In [None]:
# Define hyperparameters
model_xgb = xgb.train(param, dtrain, num_boost_round=10)

In [None]:
ypred = model_xgb.predict(dtest)
ypred

In [None]:
#Prédiction
pred = pd.Series(["Hello"])
X_test_vec = tfidf.transform(pred)
dpred = xgb.DMatrix(X_test_vec)

In [None]:
prediction = model_xgb.predict(dpred)
prediction

In [None]:
#accuracy = sum(prediction == y_test) / len(y_test)
#print('Accuracy: ', accuracy)

### API SKL

In [None]:
model_xgb_skl = xgb.XGBClassifier()

## ML : SKL Log Reg

In [None]:
X_train_arr = np.asarray(X_train.toarray())

In [None]:
X_train_arr

In [None]:
X_train_arr_df = pd.DataFrame(X_train_arr)

In [None]:
X_train_arr_df.describe()

In [None]:
model_logreg = sm.Logit(y_train, X_train)

In [None]:
result = model_logreg.fit()

In [None]:
result.summary2()

In [None]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [None]:
print('Model accuracy (%) : {:.1f}'.format(logreg.score(X_test, y_test) * 100))

In [None]:
roc_auc = roc_auc_score(y_test, logreg.predict(X_test))

fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])

In [None]:
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()