In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn. model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
tweets_train_data = pd.read_csv('C:/Users/Admin/Downloads/nlp-getting-started/train.csv')
tweets_test_data = pd.read_csv('C:/Users/Admin/Downloads/nlp-getting-started/test.csv')

In [5]:
tweets_train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
tweets_train_data.shape

(7613, 5)

In [7]:
tweets_train_data = tweets_train_data.set_index('id')
tweets_test_data = tweets_test_data.set_index('id')

In [8]:
len_train = len(tweets_train_data)
len_test = len(tweets_test_data)

In [9]:
tweets_test_data.head()

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
tweets_test_data.shape

(3263, 3)

In [11]:
tweets_data = pd.concat([tweets_train_data,tweets_test_data])

In [12]:
tweets_data.shape

(10876, 4)

In [13]:
tweets_data.isnull().sum()

keyword       87
location    3638
text           0
target      3263
dtype: int64

In [14]:
tweets_data = tweets_data.fillna('')

In [15]:
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10876 entries, 1 to 10875
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   10876 non-null  object
 1   location  10876 non-null  object
 2   text      10876 non-null  object
 3   target    10876 non-null  object
dtypes: object(4)
memory usage: 424.8+ KB


In [16]:
tweets_data['target'] = tweets_data['target'].astype(bool)

In [17]:
tweets_data['target'] = tweets_data['target'].astype(float)

In [18]:
tweets_data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1.0
4,,,Forest fire near La Ronge Sask. Canada,1.0
5,,,All residents asked to 'shelter in place' are ...,1.0
6,,,"13,000 people receive #wildfires evacuation or...",1.0
7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [19]:
tweets_data2 = tweets_data.copy()

In [20]:
X = tweets_data2.drop('target',axis=1)
y = tweets_data2['target']

In [21]:
port_stem = PorterStemmer()

In [22]:
def stemming(text):
    stemmed_content = re.sub('[^a-zA-Z]',' ',text)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [23]:
tweets_data2['text']= tweets_data2['text'].apply(stemming)

In [24]:
X = tweets_data2['text']
y = tweets_data2['target']

In [25]:
X_train = X[:len_train].drop(columns ='target')
y_train = y[:len_train]
X_test = X[len_train:]

In [26]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 7613 entries, 1 to 10873
Series name: target
Non-Null Count  Dtype  
--------------  -----  
7613 non-null   float64
dtypes: float64(1)
memory usage: 119.0 KB


In [27]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
vectorizer.fit(X_test)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [28]:
model = LogisticRegression()

In [29]:
model.fit(X_train,y_train)

LogisticRegression()

In [39]:
X_train_prediction = model.predict(X_train)
training_data_f1_score =f1_score(X_train_prediction, y_train) 

In [40]:
training_data_f1_score

0.8318730368655977

In [32]:
y_prediction = model.predict(X_test)

In [42]:
res = pd.DataFrame(y_prediction)
res.index = tweets_test_data.index
res.columns = ['target']
res.to_csv('C:/Users/Admin/Downloads/DisasterTweets.csv')