# Afet Tweetleriyle Doğal Dil İşleme
Hangi Tweetlerin gerçek felaketlerle ilgili olduğunu, hangilerinin olmadığını tahmin edin

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
tw=pd.read_csv("afettrain.csv")

In [3]:
tw.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
tw.shape

(7613, 5)

In [5]:
tw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
tw.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Bir NLP Projesine başlarken dikkat edilmesi gerekenler
#### 1-Yazıdaki tüm harfleri küçük harfe çevir 
#### 2-Noktalama işaretlerini kaldır
#### 3-Rakamları kaldır
#### 4-Satır sonlarını kaldır
#### 5-Gereksiz kelimeleri çıkart (Stopwords)
#### 6-Tokenize et
#### 7-Ekleri kaldır, kökleri bul (Lemma&Stemma)
#### 8-Vektörize et

In [8]:
#1-küçük harfe çevireceğiz
tw["text"]=tw["text"].str.lower()
#2-noktalama işaretlerini kaldıracağız
tw['text']=tw['text'].str.replace('[^\w\s]','')
tw['text']=tw['text'].str.replace('\n','')
#3-rakamları kaldır
tw['text']=tw['text'].str.replace('\d+','')
#4-satır başlarını kaldır
tw['text']=tw['text'].str.replace('\r','')
tw['text'].replace({'r"[\s]+"':''},regex=True,inplace=True)
#Gereksiz boşlukları(whitespaces) kaldırdık

In [9]:
from unicodedata import normalize
tw['text']=tw['text'].apply(lambda text: normalize("NFKD", str(text)).encode("ascii", "ignore").decode("utf-8", "ignore"))
#Aksanları ortadan kaldırdık

In [10]:
# Lang Detect
from langdetect import detect
tw['language']=tw['text'].apply(detect)
tw=tw[tw['language']=='en']
tw

Unnamed: 0,id,keyword,location,text,target,language
0,1,,,our deeds are the reason of this earthquake ma...,1,en
1,4,,,forest fire near la ronge sask canada,1,en
2,5,,,all residents asked to shelter in place are be...,1,en
3,6,,,people receive wildfires evacuation orders in...,1,en
4,7,,,just got sent this photo from ruby alaska as s...,1,en
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,en
7609,10870,,,aria_ahrary thetawniest the out of control wil...,1,en
7610,10871,,,m utckm s of volcano hawaii httptcozdtoydebj,1,en
7611,10872,,,police investigating after an ebike collided w...,1,en


In [11]:
# 5-Gereksiz kelimeleri çıkart (Stopwords)
from nltk.corpus import stopwords
stop_words=stopwords.words('english')
tw['text']=tw['text'].apply(lambda x:" ".join([i for i in str(x).split(" ")  if i not in stop_words]))
#Gereksiz kelimeleri kaldırdık

In [12]:
# 6-Tokenize et
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
tw['tokenized']=[tokenizer.tokenize(i) for i in tw['text'].values]

In [13]:
# 7-Ekleri kaldır, kökleri bul (Lemma&Stemma)
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
tw['lemmatize']=[[lemmatizer.lemmatize(str(i)) for i in words] for words in tw['text'].values]

In [14]:
from nltk.stem import PorterStemmer
pr=PorterStemmer()

In [15]:
def lemmafn(text):
    words=TextBlob(text).words
    return [pr.stem(word) for word in words]

In [16]:
# Polarity / Subjectivity
from textblob import TextBlob
tw[['polarity','subjectivity']]=tw['text'].apply(lambda t:pd.Series(TextBlob(t).sentiment))

In [17]:
tw.sample(9)

Unnamed: 0,id,keyword,location,text,target,language,tokenized,lemmatize,polarity,subjectivity
3110,4464,electrocuted,not so cool KY,michael talking electrocuted omg rowysolouisvi...,1,en,"[michael, talking, electrocuted, omg, rowysolo...","[m, i, c, h, a, e, l, , t, a, l, k, i, n, g, ...",0.0,0.0
520,752,avalanche,,colorado avalanche mens official colorado aval...,0,en,"[colorado, avalanche, mens, official, colorado...","[c, o, l, o, r, a, d, o, , a, v, a, l, a, n, ...",0.0,0.1
4504,6402,hurricane,"Anderson, SC",hurricane sick,1,en,"[hurricane, sick]","[h, u, r, r, i, c, a, n, e, , s, i, c, k]",-0.714286,0.857143
5332,7611,pandemonium,The P (South Philly),pandemonium use fav cd get httptcowhugaemc,1,en,"[pandemonium, use, fav, cd, get, httptcowhugaemc]","[p, a, n, d, e, m, o, n, i, u, m, , u, s, e, ...",0.0,0.0
4589,6527,injuries,"Georgia, U.S.A.",msnbc fucking idiot gun amp hatchet yet still ...,1,en,"[msnbc, fucking, idiot, gun, amp, hatchet, yet...","[m, s, n, b, c, , f, u, c, k, i, n, g, , i, ...",-0.211111,0.822222
6311,9022,stretcher,,invalid grazed towel stretcher pllolz witter c...,0,en,"[invalid, grazed, towel, stretcher, pllolz, wi...","[i, n, v, a, l, i, d, , g, r, a, z, e, d, , ...",0.0,0.0
1988,2860,damage,My mind is my world,complaining phoenix mode fire emblem turns ray...,0,en,"[complaining, phoenix, mode, fire, emblem, tur...","[c, o, m, p, l, a, i, n, i, n, g, , p, h, o, ...",0.0,0.0
4878,6946,massacre,,cameron_wate parents colorado theater shooting...,1,en,"[cameron_wate, parents, colorado, theater, sho...","[c, a, m, e, r, o, n, _, w, a, t, e, , p, a, ...",-0.075,0.05
4744,6748,lava,"Vancouver, BC",tried making chocolate peanut butter lava cake...,0,en,"[tried, making, chocolate, peanut, butter, lav...","[t, r, i, e, d, , m, a, k, i, n, g, , c, h, ...",0.0,0.0


In [18]:
# yukarıda çıkan sonuçları tek rakama dönüştürelim
tw.loc[tw['subjectivity']>0.5,'sentiment']=1 #pozitive
tw.loc[tw['subjectivity']==0.5,'sentiment']=0 #neutral
tw.loc[tw['subjectivity']<0.5,'sentiment']=-1 #negative
tw

Unnamed: 0,id,keyword,location,text,target,language,tokenized,lemmatize,polarity,subjectivity,sentiment
0,1,,,deeds reason earthquake may allah forgive us,1,en,"[deeds, reason, earthquake, may, allah, forgiv...","[d, e, e, d, s, , r, e, a, s, o, n, , e, a, ...",0.000000,0.000000,-1.0
1,4,,,forest fire near la ronge sask canada,1,en,"[forest, fire, near, la, ronge, sask, canada]","[f, o, r, e, s, t, , f, i, r, e, , n, e, a, ...",0.100000,0.400000,-1.0
2,5,,,residents asked shelter place notified officer...,1,en,"[residents, asked, shelter, place, notified, o...","[r, e, s, i, d, e, n, t, s, , a, s, k, e, d, ...",-0.100000,0.400000,-1.0
3,6,,,people receive wildfires evacuation orders ca...,1,en,"[people, receive, wildfires, evacuation, order...","[ , p, e, o, p, l, e, , r, e, c, e, i, v, e, ...",0.000000,0.000000,-1.0
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1,en,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[g, o, t, , s, e, n, t, , p, h, o, t, o, , ...",0.000000,0.000000,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1,en,"[two, giant, cranes, holding, bridge, collapse...","[t, w, o, , g, i, a, n, t, , c, r, a, n, e, ...",0.000000,1.000000,1.0
7609,10870,,,aria_ahrary thetawniest control wild fires cal...,1,en,"[aria_ahrary, thetawniest, control, wild, fire...","[a, r, i, a, _, a, h, r, a, r, y, , t, h, e, ...",0.100000,0.400000,-1.0
7610,10871,,,utckm volcano hawaii httptcozdtoydebj,1,en,"[utckm, volcano, hawaii, httptcozdtoydebj]","[ , u, t, c, k, m, , v, o, l, c, a, n, o, , ...",0.000000,0.000000,-1.0
7611,10872,,,police investigating ebike collided car little...,1,en,"[police, investigating, ebike, collided, car, ...","[p, o, l, i, c, e, , i, n, v, e, s, t, i, g, ...",-0.260417,0.583333,1.0


In [19]:
#icmali hali
tw.sentiment.value_counts()

-1.0    5150
 1.0    1879
 0.0     304
Name: sentiment, dtype: int64

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# 8-Vektörize et
vect=CountVectorizer(stop_words="english",ngram_range=(1,2),max_features=10000,analyzer=lemmafn)

In [23]:
# Modelleme
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [24]:
x=tw.text
y=tw.target

In [25]:
x=vect.fit_transform(x)

In [26]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=.15)

In [27]:
l=LogisticRegression()
d=DecisionTreeClassifier()

In [28]:
l.fit(x_train,y_train)

In [29]:
d.fit(x_train,y_train)

In [30]:
pred=l.predict(x_test)

In [31]:
accuracy_score(pred,y_test)

0.8072727272727273

In [32]:
pred1=d.predict(x_test)

In [33]:
accuracy_score(pred1,y_test)

0.7481818181818182

In [37]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def fnc_all_classification_models(x, y):
    # Veriyi eğitim ve test setlerine bölelim
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Sınıflandırıcıları tanımlayalım
    classifiers = [
        GaussianNB(),
        LogisticRegression(),
        RandomForestClassifier(),
        SVC(),
        KNeighborsClassifier()
    ]

    accuracy_scores = []

    for classifier in classifiers:
        classifier.fit(x_train.toarray(), y_train)  # Seyrek matrisi yoğun matrise çevir
        predictions = classifier.predict(x_test.toarray())  # Seyrek matrisi yoğun matrise çevir
        accuracy_scores.append((classifier.__class__.__name__, accuracy_score(y_test, predictions)))

    return accuracy_scores

# x ve y giriş özellikleriniz ve etiketleriniz ise
accuracy_scores = fnc_all_classification_models(x, y)
for classifier, accuracy in accuracy_scores:
    print(f"{classifier}: {accuracy}")

GaussianNB: 0.6012269938650306
LogisticRegression: 0.787321063394683
RandomForestClassifier: 0.776414451261077
SVC: 0.7995910020449898
KNeighborsClassifier: 0.6230402181322426


In [39]:
# Test verisini tahmin edelim

In [40]:
tw_test = pd.read_csv("afettest.csv")
tw_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [41]:
tw_test.shape

(3263, 4)

In [44]:
from nltk.tokenize import TweetTokenizer

In [45]:
#Text Preprocessing (adapted to clean Twitter text)
import string

stop_words = set(stopwords.words('english')) #gets the stopword list from the dedicated library and saves them
tk = TweetTokenizer() #defines the object, whose method is called in the function
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
        # Check if the input is a string
    if not isinstance(text, str):
        # Return non-string input as-is or convert to string
        return str(text) if text is not None else ''
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove user mentions
    text = re.sub(r"@\S+", "", text)
    # Remove hashtags
    text = re.sub(r"#\S+", "", text)
    # Remove punctuation
    text = re.sub(f"[{string.punctuation}]", "", text)
    # Remove emojis
    text = emoji.emojize(text, variant='emoji_type')
    # Lowercase the text
    text = text.lower()
    # Tokenize the text
    words = tk.tokenize(text)
    # Lemmatize the text
    words = [lemmatizer.lemmatize(w) for w in words]
    # Remove stop words
    words = [w for w in words if w not in stop_words]
    # Join the tokens back together
    return ' '.join(words)

In [46]:
def preprocess_text(text):
    # Metin ön işleme adımlarını uygula (örneğin, küçük harfe çevirme, özel karakterleri temizleme, vb.)
    # Burada gerçekleştirilecek işlemler projenin gereksinimlerine bağlı olacaktır.
    processed_text = text.lower()  # Örneğin, metni küçük harfe çevirme
    # Diğer metin ön işleme adımlarını buraya ekleyebilirsiniz.
    return processed_text

In [47]:
tw_test['text'] = tw_test['text'].apply(preprocess_text)

In [48]:
# 'tw_test' verilerini vektörleştirin
x_valid = vect.transform(tw_test['text'])

# Tahminleri yapın
y_preds = l.predict(x_valid)

# Tahminleri 'target' sütununa ekleyin
tw_test['predicted_target'] = y_preds

# Tahmin edilen 'target' sütununu içeren bir DataFrame oluşturun
submission_df = tw_test[['id', 'predicted_target']]

# Submission dosyasını CSV olarak kaydedin
submission_df.to_csv('submission.csv', index=False)