In [1]:
#pip install kaggle


In [2]:
##!kaggle competitions download -c nlp-getting-started


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the competition's train and test splits from the local download folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "nlp-getting-started/train.csv",
        "nlp-getting-started/test.csv",
    )
)

In [5]:
# Peek at the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Peek at the first five test rows (this frame has no target column yet).
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Only the tweet text is used as a model input below, so drop the other
# descriptive columns. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed columns.
unused_columns = ["keyword", "location"]
train_df = train_df.drop(columns=unused_columns, errors="ignore")
test_df = test_df.drop(columns=unused_columns, errors="ignore")

In [8]:
# Inspect the last five training rows after the column drop.
train_df.tail(n=5)

Unnamed: 0,id,text,target
7608,10869,Two giant cranes holding a bridge collapse int...,1
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,Police investigating after an e-bike collided ...,1
7612,10873,The Latest: More Homes Razed by Northern Calif...,1


In [9]:
# Inspect the last five test rows after the column drop.
test_df.tail(n=5)

Unnamed: 0,id,text
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Storm in RI worse than last hurricane. My city...
3260,10868,Green Line derailment in Chicago http://t.co/U...
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...
3262,10875,#CityofCalgary has activated its Municipal Eme...


In [10]:
import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words("english")
# "not" is taken out of the stopword list so it survives the filtering below
# (presumably to preserve negation in the tweets).
all_stopwords.remove("not")

stopwords_set = set(all_stopwords)

# Compiled once instead of being re-parsed for every tweet; matches any
# character that is not an ASCII letter.
NON_LETTER_RE = re.compile('[^a-zA-Z]')


def clean_text(raw_text):
    """Normalize one tweet into a space-joined string of lemmatized tokens.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop stopwords, lemmatize the remaining words.

    Parameters
    ----------
    raw_text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    words = NON_LETTER_RE.sub(' ', raw_text).lower().split()
    kept = [lemmatizer.lemmatize(word) for word in words if word not in stopwords_set]
    return " ".join(kept)


# Iterate over the column's values directly rather than looking rows up by
# integer label, so the result does not depend on train_df having the default
# 0..n-1 index.
corpus = [clean_text(tweet) for tweet in train_df["text"]]


[nltk_data] Downloading package stopwords to C:\Users\BAHA
[nltk_data]     ENES\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\BAHA
[nltk_data]     ENES\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\BAHA
[nltk_data]     ENES\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
# Show only a handful of cleaned documents; printing the whole corpus floods
# the notebook output with one entry per training tweet.
corpus[:5]



In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Select the label column by name instead of by position, so adding or
# reordering columns in train_df can never silently change what y contains.
y = train_df["target"].to_numpy()
# The model input is the list of cleaned tweet strings built earlier.
X = corpus

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [14]:
# Hold out 20% of the cleaned tweets for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Bag-of-words counts fed into an XGBoost classifier.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", XGBClassifier()),
])

# Only the vocabulary size is tuned: 1000 to 4500 in steps of 500.
param_grid = {"vectorizer__max_features": list(range(1000, 4501, 500))}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi doğruluk skoru: ", grid_search.best_score_)

# Evaluate the refit best estimator on the held-out split:
# confusion matrix first, then overall accuracy.
y_pred = grid_search.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy: ", accuracy_score(y_test, y_pred))


Fitting 10 folds for each of 8 candidates, totalling 80 fits
En iyi parametreler:  {'vectorizer__max_features': 2000}
En iyi doğruluk skoru:  0.7917898193760263
[[783  91]
 [234 415]]
Accuracy:  0.7866053841103086


In [16]:
# Bag-of-words counts fed into a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])

# Only the vocabulary size is tuned: 1000 to 4500 in steps of 500.
param_grid = {"vectorizer__max_features": list(range(1000, 4501, 500))}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi doğruluk skoru: ", grid_search.best_score_)

# Evaluate the refit best estimator on the held-out split:
# confusion matrix first, then overall accuracy.
y_pred = grid_search.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy: ", accuracy_score(y_test, y_pred))


Fitting 10 folds for each of 8 candidates, totalling 80 fits
En iyi parametreler:  {'vectorizer__max_features': 2500}
En iyi doğruluk skoru:  0.7990147783251232
[[738 136]
 [183 466]]
Accuracy:  0.7905449770190414


In [17]:
# Clean the test tweets with the same steps applied to the training corpus.
# Iterating over the column's values (instead of test_df['text'][i]) avoids a
# label-based lookup that only works when test_df has the default 0..n-1 index.
test_corpus = []
for raw_text in test_df["text"]:
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)  # keep letters only
    words = letters_only.lower().split()  # lowercase, then tokenize on whitespace
    # Lemmatize every non-stopword and join the tokens back into one string.
    kept = [lemmatizer.lemmatize(word) for word in words if word not in stopwords_set]
    test_corpus.append(" ".join(kept))

# Predict with the fitted search object.
# NOTE(review): `grid_search` is whichever search cell was executed last; when
# the notebook is run top to bottom that is the MultinomialNB search.
y_test_pred = grid_search.predict(test_corpus)
test_df["target"] = y_test_pred

In [18]:
# Write the id/target pairs without mutating test_df. The previous in-place
# drop of "text" made this cell fail with a KeyError on a second run and left
# the test preprocessing cell unable to re-run, since it reads test_df["text"].
submission_df = test_df[["id", "target"]]
submission_df.to_csv("test_predictions.csv", index=False)