<img src="../../img/nlp.png" width="600" height="270">  

In [122]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [123]:
# encoding = latin1 --> data içinde latin alfabsi var
# r --> read
data = pd.read_csv(r"../../data/gender_classifier.csv",encoding = "latin1")

In [124]:
# 2 tane serious veya data frame birleştirmek
data = pd.concat([data.gender,data.description],axis=1)

In [125]:
data

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
...,...,...
20045,female,(rp)
20046,male,"Whatever you like, it's not a problem at all. ..."
20047,male,#TeamBarcelona ..You look lost so you should f...
20048,female,Anti-statist; I homeschool my kids. Aspiring t...


In [126]:
data.dropna(axis = 0,inplace = True)

In [127]:
data.gender = [ 1 if each == "female" else 0 for each in data.gender]

## REGULAR EXPRESSION (RE)

Textler clean edilecek.

### Cleaning Data

In [128]:
import re

In [129]:
first_description = data.description[4]
first_description

'Ricky Wilson The Best FRONTMAN/Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always :) Xxxxxxx'

In [130]:
#a' dan z'ye-A'dan Z'ye olmayanları boşlukla değiştir.
description = re.sub("[^a-z A-Z]","",first_description)
description

'Ricky Wilson The Best FRONTMANKaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always  Xxxxxxx'

In [131]:
# Tüm harfleri küçük harflere çevirir.
description = description.lower()
description

'ricky wilson the best frontmankaiser chiefs the best band xxxx thank you kaiser chiefs for an incredible year of gigs and memories to cherish always  xxxxxxx'

## IRREVALANT WORDS (STOPWORDS)
Gereksiz kelimeler.

In [132]:
# natural labguage tool kit
import nltk 

In [133]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/irem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [134]:
from nltk.corpus import stopwords

In [135]:
# Kelimeleri ayırır ve bir listenn içinde depolar.
# description = description.split()

# Split yerine tokenizer kullanılabilir.

In [136]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/irem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [137]:
nltk.data.path.append("/home/irem/Desktop/machine_learning/machine_learning/data/nltk_data")

In [138]:
# Kelimeleri ayırır ve bir listenn içinde depolar.
description = nltk.word_tokenize(description)

In [139]:
stre = "shouldn't ve guzel"

In [140]:
# Örneğin should not algılanamaz. 
stre.split()

["shouldn't", 've', 'guzel']

In [141]:
# nltk ile kelimeler algılanır.
nltk.word_tokenize(stre)

['should', "n't", 've', 'guzel']

### Gereksiz kelimeleri çıkar

In [142]:
# set --> Tekrar eden kelimeleri set et.
description = [ word  for word in description if not word in set(stopwords.words("english"))]

## LEMMATIZATION

Kelimlerin köklerini bulma.

loved - love  gitmeyeceğim - git

In [143]:
import nltk as nlp 

In [144]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/irem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [145]:
lemma = nlp.WordNetLemmatizer()
description = [ lemma.lemmatize(word) for word in description ]

In [146]:
# description içindeki herbir kelimeyi boşlukla birleştir ardından tekrar text oluşur.
description = "".join(description)

Yukarıda sadece 4. data için yapılan işlem aşağıda tüm data için yapılacak.

## DATA CLEANING

In [147]:
description_list = []

for description in data.description:

    # [^a-z A-Z] aralığında olmayanları boşlukla değiştir.
    description = re.sub("[^a-z A-Z]","",description)

    # Tüm data küçük harf.
    description = description.lower()

    # Parçalar.
    description = nltk.word_tokenize(description)


    description = [ word  for word in description if not word in set(stopwords.words("english"))]

    # Kök bulma
    lemma = nlp.WordNetLemmatizer()
    description = [ lemma.lemmatize(word) for word in description ]

    # Boşlukla birleştir tekrara bi araya getir.
    description = " ".join(description)
    description_list.append(description)

## BAG OF WORDS

<img src="../../img/nlp_1.png" width="800" height="270">  

In [148]:
from sklearn.feature_extraction.text import CountVectorizer

In [149]:
# data içinde bulunan 32000 kelimeden 500 tanesini seç.
max_features = 500

In [150]:
# stop_words= "english" --> İngilizce de gereksiz kelimeleri at.
# lowercase= --> Burada kullanılarak da tüm harfler küçük yapılabilir.
# tokken_pattern kullanılarak da gereksiz karakterler kaldırılabilir fakat yukarıda "description = re.sub("[^a-z A-Z]","",description)" kullanıldığında ekleme yapılmadı.
count_vectorizer = CountVectorizer(max_features = max_features,stop_words= "english")

In [151]:
# sparce_matrix --> Yukarıda verilen img içindeki 1 ve 0'lara sparce_matrix denir.
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray() # x feature

print("en sik kullanilan {} kelimeler: {}".format(max_features, count_vectorizer.get_feature_names_out()))

en sik kullanilan 500 kelimeler: ['account' 'activist' 'addict' 'adult' 'adventure' 'advocate' 'aka' 'alum'
 'amazing' 'america' 'american' 'angel' 'animal' 'anime' 'app' 'area'
 'art' 'artist' 'ask' 'aspiring' 'author' 'award' 'away' 'awesome' 'baby'
 'bad' 'band' 'based' 'beautiful' 'beauty' 'beer' 'believe' 'best'
 'better' 'big' 'bio' 'bit' 'bitch' 'black' 'blog' 'blogger' 'blue' 'book'
 'booking' 'born' 'bot' 'boy' 'brand' 'breaking' 'building' 'business'
 'buy' 'car' 'care' 'cat' 'cause' 'ceo' 'change' 'channel' 'check'
 'chicago' 'chief' 'child' 'christ' 'christian' 'city' 'class' 'club'
 'coach' 'coffee' 'college' 'come' 'comic' 'coming' 'communication'
 'community' 'company' 'computer' 'conservative' 'consultant' 'contact'
 'content' 'continuous' 'control' 'cool' 'country' 'county' 'cover'
 'crazy' 'create' 'creative' 'creator' 'culture' 'current' 'currently'
 'dad' 'daily' 'dance' 'data' 'day' 'deal' 'dedicated' 'design' 'designer'
 'developer' 'development' 'didnt' 'die' 'di

## TEXT CLASSIFICATION

In [152]:
y = data.iloc[:,0].values # male or female classes
x = sparce_matrix

In [153]:
# train test split
from sklearn.model_selection import train_test_split

In [154]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.33, random_state=42)

### Naive Bayes Modeli

In [155]:
from sklearn.naive_bayes import GaussianNB

In [156]:
nb = GaussianNB()

In [157]:
nb.fit(x_train,y_train)

In [162]:
# prediction 
y_pred = nb.predict(x_test)

In [163]:
print("accuracy: ",nb.score(y_pred.reshape(-1,1),y_test))

ValueError: X has 5354 features, but GaussianNB is expecting 500 features as input.