# Natural Language Process (NLP)

In [1]:
import pandas as pd

In [2]:
 data=pd.read_csv(r'gc.csv',encoding="latin1") 
#Database icerisinde latin harfleri var hata vermemesi icin encoding="latin1" dedik
#r ise read'in r'si

In [3]:
data = pd.concat([data.gender, data.description],axis=1) 
#iki columndan olusan data olusturduk.
data.dropna(axis=0 ,inplace=True)
#inplace=True yazmasaydık data= data.dropna(axis=0) ayni sey.
data.gender=[ 1 if each =="famale" else 0 for each in data.gender]

In [4]:
data.head()

Unnamed: 0,gender,description
0,0,i sing my own rhythm.
1,0,I'm the author of novels filled with family dr...
2,0,louis whining and squealing and all
3,0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,0,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


# Regular Expression (RE)

- cleaning data
- regular expression RE mesela "[^a-zA-Z]"

In [5]:
import re
first_description = data.description[4]

In [6]:
first_description

'Ricky Wilson The Best FRONTMAN/Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always :) Xxxxxxx'

In [7]:
#buradaki gülücük gibi seyleri cleaning edecegiz.

In [8]:
description = re.sub("[^a-zA-Z]"," ", first_description) 

- ^ bu isaret bulma(-ma olumsuzluk eki) demektir.
- yukarida a'dan z'ye ve büyük a'dan büyük z'ye kadar olan harflerden bulma bulduysan ise boslukla degistir
- bu islemi ise first_description'da yap.

In [9]:
description

'Ricky Wilson The Best FRONTMAN Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always    Xxxxxxx'

- buyuk harf ve kucuk harfle ayni seyi bile yazsak bilgisayar dilinde farkli kelimelermis gibi algilanir simdi bunu duzeltecegiz.

In [10]:
description = description.lower() #her seyi kucuk harfe cevirme

In [11]:
description

'ricky wilson the best frontman kaiser chiefs the best band xxxx thank you kaiser chiefs for an incredible year of gigs and memories to cherish always    xxxxxxx'

# Stopwords (Irrelavant Words)
- gereksiz kelimeler (the,and,as gibi..)

In [12]:
import nltk #natural language tool kit
nltk.download("stopwords") #corpus diye bir klasore indiriliyor.
from nltk.corpus import stopwords #sonra corpus klasorunden import ediliyor.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bestecetin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


- once metinleri kelime kelime ayiracagiz daha sonra stopwords var mi diye kiyaslayacagiz.

In [13]:
description = description.split()

In [14]:
description

['ricky',
 'wilson',
 'the',
 'best',
 'frontman',
 'kaiser',
 'chiefs',
 'the',
 'best',
 'band',
 'xxxx',
 'thank',
 'you',
 'kaiser',
 'chiefs',
 'for',
 'an',
 'incredible',
 'year',
 'of',
 'gigs',
 'and',
 'memories',
 'to',
 'cherish',
 'always',
 'xxxxxxx']

- split metindeki kelimeleri bir liste icine koyarak ayirdi.
- split( ) dedik parantez icinde bi sey olmamasi default olarak bosluklara gore ayirmasini ifade ediyor.
- split yerine tokenizer kullanabiliriz. Nltk kutuphanesine ait.

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bestecetin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
description = nltk.word_tokenize(description)

- split yerine tokenizer neden kullanilir?

In [17]:
stre= "shouldn't and good"
stre.split()

["shouldn't", 'and', 'good']

In [18]:
stre2 = "shouldn't and good"
stre2 = nltk.word_tokenize(stre2)
stre2

['should', "n't", 'and', 'good']

- aslinda shouldn't kelimesi should ve not olarak ayri kelimelerdir bunu split duzgun ayirmiyor fakat tokenizer not'i ayiriyor.

#gereksiz kelimeleri cikar

In [19]:
description = [word for word in description if not word in set(stopwords.words("english"))]

- description'ın icindeki wordleri dolan daha sonra bana word'u dondur, set et yani uniqueleri bul yani tekrar eden kelimeler varsa set et.

In [20]:
print(description)

['ricky', 'wilson', 'best', 'frontman', 'kaiser', 'chiefs', 'best', 'band', 'xxxx', 'thank', 'kaiser', 'chiefs', 'incredible', 'year', 'gigs', 'memories', 'cherish', 'always', 'xxxxxxx']


- Textin icinden the, of, an gibi kelimelerin cikarildigi gorulur.

# Lemmatization

- yemeğe, yemeği gibi ekler olunca bilgisayar bunlari farkli algilar bunun yemek oldugunu bilgisayara ogretmemiz gerekir.
- Bu yüzden kelimelerimizin koklerini bulmamiz gerekir, bunun icin Lemmatization kullanacagiz.

In [21]:
from nltk.stem import WordNetLemmatizer

In [22]:
lemma = WordNetLemmatizer()

In [23]:
description = [lemma.lemmatize(word) for word in description]

- description'un icindeki tum kelimeleri dolan ve hepsinin kokunu bul

In [24]:
description

['ricky',
 'wilson',
 'best',
 'frontman',
 'kaiser',
 'chief',
 'best',
 'band',
 'xxxx',
 'thank',
 'kaiser',
 'chief',
 'incredible',
 'year',
 'gig',
 'memory',
 'cherish',
 'always',
 'xxxxxxx']

In [25]:
description = " ".join(description)

- bosluk ile kelimelerimi birleştir.

In [26]:
description

'ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx'

Yani dataki ;
- gereksiz kelimeler(stop word) ortadan kalkti
- hepsi kucuk harf oldu
- gereksiz karakterler ortadan kalkti
- her bir kelimenin koku yazildi 
- clean data yapilmis oldu

- Bu adimlarin hepsi bir datadaki 4. description icin yapilmis oldu.
- Bunu tum dataya yapmak icin bir for dongusu kullanacagiz.

In [27]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bestecetin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
description_list=[]

In [29]:
for description in data.description:
    description = re.sub("[^a-zA-Z]"," ", description) 
    description = description.lower() 
    description = nltk.word_tokenize(description)
    description = [word for word in description if not word in set(stopwords.words("english"))]
    lemma = WordNetLemmatizer()
    description = [lemma.lemmatize(word) for word in description]
    description = " ".join(description)
    description_list.append(description) 

In [30]:
description_list

['sing rhythm',
 'author novel filled family drama romance',
 'louis whining squealing',
 'mobile guy er shazam google kleiner perkins yahoo sprint pc airtouch air force stanford gsb uva dad husband brother golfer',
 'ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx',
 'know',
 'global marketplace image video music sharing photo inspiration design tip video creative community',
 'secret getting ahead getting started',
 'pll fan crazy mcd ramen bae',
 'renaissance art historian university nottingham fuelled haribo partial coffee soft spot renaissance china national teaching fellow',
 'clean food taste great providing energy nutrient guilt granola vegan paleo friendly option cert organic gf kosher',
 'highly extraordinary auction',
 'senior xi xii mmxiv',
 'come join fastest blog network online today http co mfpa vgk http co mpuuqtyf g cover credit repair credit card bankruptcy',
 'im p bo burnham disney world',
 

# Bag of Words

3 adet cumlem olsun:
- 1) merhaba ben geldim
- 2) merhaba sen okula git
- 3) merhaba ben okula dondum
- ilk cumlenin her kelimesini feature yapar 
- ikinci kelimede merhaba haric sen okula git kelimeleri yine feature olur
- islem tekrarlanir 
- her satırda bu columnlar guncellenir ve olmayan kelimeler eklenir, olanlar ise isaretlenir.
- yani her bir satır cumleler, her bir sutun ise kelimelerdir.

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
#bag of words yaratmak icin kullanilan method

In [32]:
data.shape

(16224, 2)

bu datada en cok kullanilan 500 kelimeyi sececegiz.

In [33]:
max_features = 500

In [34]:
count_vectorizer = CountVectorizer(max_features=max_features,stop_words="english")

- stop wordler(gereksiz kelimeler) cumlelerin icinden cikarildi.
- ayni zamanda istersek burada kucuk harfe donusturebilirdik lowercase yazarak.
- token_pattern kullanarak nokta unlem gibi seyleri ortadan kaldirabiliriz bunu biz re.sub kullanarak yapmistik.
- bunlarin hepsini CountVectorizer'i kullanarak yapabiliyoruz.

In [35]:
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()

- methodu cagiriyoruz, fit yapip uyarla bunu sparce_matrix'e esitle

In [36]:
print("en sık kullanilan {} kelimeler {}".format(max_features,count_vectorizer.get_feature_names()))

en sık kullanilan 500 kelimeler ['account', 'activist', 'actor', 'addict', 'adult', 'adventure', 'advocate', 'alum', 'amazing', 'america', 'american', 'angel', 'animal', 'anime', 'app', 'area', 'art', 'artist', 'ask', 'aspiring', 'author', 'award', 'away', 'awesome', 'baby', 'bad', 'band', 'based', 'beautiful', 'beauty', 'beer', 'believe', 'best', 'better', 'big', 'bio', 'bit', 'bitch', 'black', 'blog', 'blogger', 'blue', 'book', 'booking', 'born', 'bot', 'boy', 'brand', 'breaking', 'building', 'business', 'ca', 'car', 'care', 'cat', 'cause', 'ceo', 'certified', 'change', 'channel', 'check', 'chicago', 'chief', 'child', 'christ', 'christian', 'city', 'class', 'club', 'coach', 'coffee', 'college', 'com', 'come', 'comic', 'coming', 'communication', 'community', 'company', 'computer', 'conservative', 'consultant', 'contact', 'content', 'continuous', 'control', 'cool', 'country', 'county', 'crazy', 'create', 'creative', 'creator', 'culture', 'currently', 'dad', 'daily', 'dance', 'data', 'd

- x'imiz sparce_matrix olacak.

In [37]:
y = data.iloc[:,0].values #male or female
x = sparce_matrix

In [38]:
#train test split
from sklearn.model_selection import train_test_split

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,random_state=42)

In [40]:
#naive bayes

In [44]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train,y_train)

GaussianNB()

In [45]:
#prediction
y_pred = nb.predict(x_test)

In [46]:
#accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(accuracy))

Accuracy: 1.0
