In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input/')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the competition's labelled training file and the unlabelled test file.
train_data = pd.read_csv(r"/kaggle/input/nlp-getting-started/train.csv")
test_data = pd.read_csv(r"/kaggle/input/nlp-getting-started/test.csv")

# Preprocessing

In [4]:
# Preview the first rows of the training frame (id, keyword, location, text, target).
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# (rows, columns) of the training frame.
train_data.shape

(7613, 5)

In [6]:
# Raw text column of the training frame; the features below are built from it alone.
x_text = train_data["text"]

In [7]:
import re

# Contraction expansions, applied in order: the specific forms ("won't",
# "can't") must run before the generic suffix rules ("n't", "'t").
# Matching is ASCII case-insensitive so capitalised contractions such as
# "Won't" or "CAN'T" are expanded too instead of degrading to "wo not"/"ca not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE | re.ASCII), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def cleaning(sentence):
    """Normalise one raw text string into lowercase, letters-only words.

    Steps, in order:
      1. Expand common English contractions (case-insensitively).
      2. Drop every whitespace-delimited token that contains a digit.
      3. Replace each run of non-ASCII-letter characters with a single space.
      4. Lowercase the result.

    Parameters
    ----------
    sentence : str or None or float('nan')
        Raw text. ``None`` and NaN (how pandas represents a missing cell)
        are treated as empty text.

    Returns
    -------
    str
        Lowercase text containing only ``a-z`` and single spaces. It may
        start or end with one space where the input had leading/trailing
        punctuation, so callers should tokenise with ``str.split()``.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    for pattern, replacement in _CONTRACTION_RULES:
        sentence = pattern.sub(replacement, sentence)

    # Remove whole tokens that contain a digit (years, counts, URL fragments...).
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
    # Everything that is not an ASCII letter (punctuation, '#', symbols,
    # whitespace runs) collapses to one space, so no separate punctuation
    # pass is needed afterwards.
    sentence = re.sub(r"[^A-Za-z]+", " ", sentence)
    return sentence.lower()

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [9]:
# English Snowball stemmer used to reduce every token to its stem below.
# (PorterStemmer is imported in this notebook but never used.)
snow = nltk.stem.SnowballStemmer('english')

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

# Keep the English stop words in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# One list of stemmed, stop-word-free tokens per training text.
temp = [
    [snow.stem(token) for token in cleaning(text).split() if token not in stop_words]
    for text in x_text
]

In [11]:
# Peek at the token lists of the first two training texts.
temp[:2]

[['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us'],
 ['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada']]

In [12]:
# Turn each token list back into one string (every token prefixed by a space),
# because the vectorizer below takes documents as strings, not token lists.
final_word = [''.join(' ' + token for token in tokens) for tokens in temp]

In [13]:
# Peek at the first two rebuilt training documents.
final_word[:2]

[' deed reason earthquak may allah forgiv us',
 ' forest fire near la rong sask canada']

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the cleaned training
# documents and encode them as a document-by-term count matrix.
converter = CountVectorizer()
x = converter.fit_transform(final_word)

In [15]:
# Convert the vectorizer's sparse output to a dense NumPy array.
# The guard makes the cell safe to re-run: once `x` is already an ndarray it
# has no `.toarray()` and the unguarded call would raise AttributeError.
# NOTE(review): the dense matrix has about 100 million cells (7613 x 13366);
# the estimators used below also accept the sparse matrix directly.
if hasattr(x, "toarray"):
    x = x.toarray()

In [16]:
# Label column the classifier is trained to predict.
y = train_data["target"]

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Report the shape of each partition.
for label, part in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, part.shape)

x_train (5329, 13366)
x_test (2284, 13366)
y_train (5329,)
y_test (2284,)


# Building model

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Logistic regression wrapped in a grid search over C (a single candidate here),
# fitted on the training split only.
model_lr = LogisticRegression(max_iter=1000)
hyperparmeter_C = {'C': [0.1]}
grid_search = GridSearchCV(estimator=model_lr, param_grid=hyperparmeter_C).fit(x_train, y_train)

print("Best C Value is ", grid_search.best_params_)

# Score on the held-out split, reported as a percentage.
test_accuracy = grid_search.score(x_test, y_test)
print("test accuracy ", test_accuracy * float(100))

Best C Value is  {'C': 0.1}
test accuracy  81.26094570928196


# For test data

In [19]:
# Preview the test frame: same columns as the training frame, minus `target`.
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [20]:
# Raw text column of the test frame, to be preprocessed like the training text.
x_test_text = test_data["text"]

In [21]:
# Same cleaning, stop-word removal and stemming as for the training text:
# one list of tokens per test text.
temp_test = [
    [snow.stem(token) for token in cleaning(text).split() if token not in stop_words]
    for text in x_test_text
]

In [22]:
# Peek at the token lists of the first two test texts.
temp_test[:2]

[['happen', 'terribl', 'car', 'crash'],
 ['heard', 'earthquak', 'differ', 'citi', 'stay', 'safe', 'everyon']]

In [23]:
# Turn each test token list back into one string (every token prefixed by a
# space) so it can be passed to the fitted vectorizer.
final_word_test = [''.join(' ' + token for token in tokens) for tokens in temp_test]

In [24]:
# Peek at the first two rebuilt test documents.
final_word_test[:2]

[' happen terribl car crash', ' heard earthquak differ citi stay safe everyon']

In [25]:
# Encode the test documents with the vocabulary `converter` learned when the
# training matrix was built. The vectorizer is deliberately NOT refitted here:
# refitting on the same corpus is redundant, and refitting on anything else
# would change the column layout the classifier was trained on.
test_transformed = converter.transform(final_word_test)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Fit the same single-candidate grid search again, this time on the complete
# training matrix, and use it to label the test documents.
model_lr = LogisticRegression(max_iter=1000)
hyperparmeter_C = {'C': [0.1]}
grid_search = GridSearchCV(estimator=model_lr, param_grid=hyperparmeter_C).fit(x, y)

final_prediction = grid_search.predict(test_transformed)

In [27]:
# Inspect the predicted labels for the test set (NumPy abbreviates long arrays).
final_prediction

array([1, 1, 1, ..., 1, 1, 0])

In [28]:
# The sample submission is read only to sanity-check the expected row count.
sub_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/sample_submission.csv")
if len(sub_df) != len(final_prediction):
    # Concatenating frames of different lengths would silently pad with NaN.
    raise ValueError(
        f"Expected {len(sub_df)} predictions for the submission, got {len(final_prediction)}"
    )

# Take the ids from the frame the predictions were computed on, so every
# prediction is paired with its own row by construction rather than by
# assuming the sample file lists ids in the same order as test.csv.
datasets = pd.DataFrame({'id': test_data['id'].to_numpy(), 'target': final_prediction})
datasets.to_csv('submission.csv', index=False)