<a href="https://colab.research.google.com/github/Yakupayaz/NLP/blob/master/Data_Cleaning_Vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning and Vectorization For NLP

## Install and Import

In [None]:
# Mount the user's Google Drive under /content/drive (Colab environment).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)

# **Text Correction**

In [None]:
!pip install autocorrect
# autocorrect fixes misspelled words.
# It is rarely used since large language models became available; its accuracy is fairly low.
# It is mostly used for English, where its accuracy is better.



In [None]:
from autocorrect import Speller

In [None]:
spell = Speller(lang='en')

# See https://github.com/filyp/autocorrect for the list of other supported languages.

In [None]:
%%time
text = "Oah man, this is pretty cooll. We wilel do more sucha things. He haasn't Ali's 2 bookss.  2.1 ½ % ()"
sample_text = spell(text)
sample_text

CPU times: user 1.82 ms, sys: 0 ns, total: 1.82 ms
Wall time: 1.83 ms


"Oh man, this is pretty cool. We will do more such things. He hasn't Ali's 2 books.  2.1 ½ % ()"

In [None]:
%%time
spell = Speller(lang='en', fast=True)
sample_text = spell(text)
sample_text

CPU times: user 74.1 ms, sys: 8 ms, total: 82.1 ms
Wall time: 82.2 ms


"Oh man, this is pretty cool. We will do more such things. He hasn't Ali's 2 books.  2.1 ½ % ()"

# **Acronyms/Contractions of Text**



In [None]:
!pip install contractions

# https://github.com/kootenpv/contractions
# Expands contractions into their full form.
# you're => you are



In [None]:
import contractions

In [None]:
sample_text = contractions.fix(sample_text)
sample_text
# "hasn't" was expanded to "has not".

"Oh man, this is pretty cool. We will do more such things. He has not Ali's 2 books.  2.1 ½ % ()"

In [None]:
contractions.fix("u're")

'you are'

In [None]:
contractions.fix(contractions.fix("u're"))

'you are'

In [None]:
contractions.add("u're", "you are")
# Instead of calling fix() two or more times on a word, we can register the contraction in the library so that a single call is enough.

In [None]:
contractions.fix("u're")

'you are'

In [None]:
!pip install nltk



## Tokenization

In [None]:
import nltk

In [None]:
nltk.download('punkt') # downloads the files needed for sentence and word tokenization.
nltk.download('stopwords') # downloads the stop-word lists.
nltk.download('wordnet') # downloads the files needed for normalization (lemmatization).
nltk.download('omw-1.4')

# omw-1.4 is downloaded alongside wordnet; presumably the lemmatizer needs it to relate a word's
# original form to its base form -- TODO confirm against the NLTK documentation.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# sent_tokenize : splits the text into sentence tokens.
# word_tokenize : splits the text into word tokens.

In [None]:
sentence_token = sent_tokenize(sample_text.lower())
sentence_token

['oh man, this is pretty cool.',
 'we will do more such things.',
 "he has not ali's 2 books.",
 '2.1 ½ % ()']

In [None]:
word_token = word_tokenize(sample_text.lower())
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'he',
 'has',
 'not',
 'ali',
 "'s",
 '2',
 'books',
 '.',
 '2.1',
 '½',
 '%',
 '(',
 ')']

## Removing Punctuation and Numbers

In [None]:
tokens_without_punc = [w for w in word_token if w.isalpha()]
tokens_without_punc
# str.isalpha() returns True only when every character of the token is alphabetic,
# so this filter drops punctuation marks and tokens containing digits or symbols (e.g. ',', '2', '2.1', "'s").

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'he',
 'has',
 'not',
 'ali',
 'books']

## Removing Stopwords

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words("english")


In [None]:
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'he',
 'has',
 'not',
 'ali',
 'books']

In [None]:
token_without_sw = [t for t in tokens_without_punc if t not in stop_words]

token_without_sw

# Stop words have been removed from the data.
# For sentiment analysis, negation words (e.g. "not", "no") should NOT be removed.
# Punctuation and special characters were already removed in the previous step.

['oh', 'man', 'pretty', 'cool', 'things', 'ali', 'books']

## Data Normalization-Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Reduces each token to its base form (lemma).
# It checks that the meaning of the word is not lost in the process.

In [None]:
WordNetLemmatizer().lemmatize("drive")

'drive'

In [None]:
# Lemmatize a few forms of "drive" with one shared lemmatizer instance.
# lemmatize() is called without a `pos` argument here.
samples = ["driving", "drivers", "driver", "drives", "drove", "driven"]
lemmatizer = WordNetLemmatizer()
for word in samples:
    print(lemmatizer.lemmatize(word))

driving
driver
driver
drive
drove
driven


In [None]:
lem = [WordNetLemmatizer().lemmatize(t) for t in token_without_sw]

In [None]:
token_without_sw


['oh', 'man', 'pretty', 'cool', 'things', 'ali', 'books']

In [None]:
lem

['oh', 'man', 'pretty', 'cool', 'thing', 'ali', 'book']

## Data Normalization-Stemming

In [None]:
from nltk.stem import PorterStemmer
# Stemming does not check whether the meaning of the word is lost.
# It strips the word down to its stem.

In [None]:
PorterStemmer().stem("driving")

'drive'

In [None]:
# Stem a few forms of "drive" with one shared PorterStemmer instance.
samples = ["driving", "drivers", "driver", "drives", "drove", "driven"]
stemmer = PorterStemmer()
for word in samples:
    print(stemmer.stem(word))

drive
driver
driver
drive
drove
driven


In [None]:
stem = [PorterStemmer().stem(t) for t in token_without_sw]

In [None]:
token_without_sw

['oh', 'man', 'pretty', 'cool', 'things', 'ali', 'books']

In [None]:
stem

['oh', 'man', 'pretti', 'cool', 'thing', 'ali', 'book']

## Joining

In [None]:
" ".join(lem)

'oh man pretty cool thing ali book'

## Cleaning Function - for classification (NOT for sentiment analysis)

In [None]:
def cleaning(data):
    """Clean one raw text for classification tasks (NOT for sentiment analysis).

    Pipeline: spell-correct -> expand contractions -> lowercase and word-tokenize
    -> keep purely alphabetic tokens -> drop English stop words -> lemmatize
    -> join with single spaces.

    Every English stop word is removed, negations such as "not" and "no"
    included, so the polarity of a sentence can be lost. Use this function only
    when no sentiment analysis will be performed on the result.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined by single spaces.
    """
    import contractions
    from autocorrect import Speller
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # 1. Correction of text
    spell = Speller(lang='en', fast=True)
    correction_data = spell(data)

    # 2. Contraction of text
    contraction_data = contractions.fix(correction_data)

    # 3. Tokenize and lower
    text_tokens = word_tokenize(contraction_data.lower())

    # 4. Remove punctuation and numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 5. Remove stop words. The list is built here, not read from a notebook-level
    #    variable, so the function also works on a fresh kernel; a set gives fast lookups.
    stop_words = set(stopwords.words("english"))
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    # 6. Lemmatize with a single lemmatizer instance instead of one per token
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # Joining
    return " ".join(text_cleaned)

In [None]:
pd.Series(sample_text).apply(cleaning) #df["text"].apply(cleaning)

0    oh man pretty cool thing ali book
dtype: object

## Cleaning Function - for sentiment analysis

In [None]:
sample_text= "Oh man, this is pretty cool. We will do more such things. don't aren't are not. no problem. it isn't problem"

In [None]:
word = word_tokenize(sample_text.lower())
word

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'do',
 "n't",
 'are',
 "n't",
 'are',
 'not',
 '.',
 'no',
 'problem',
 '.',
 'it',
 'is',
 "n't",
 'problem']

In [None]:
def cleaning_fsa(data):
    """Clean one raw text for sentiment analysis, keeping negation words.

    Pipeline: spell-correct -> expand contractions -> lowercase and word-tokenize
    -> keep purely alphabetic tokens -> drop English stop words except negations
    -> lemmatize -> join with single spaces.

    Negations carry the polarity of a sentence ("not good" vs. "good"), so
    "no", "not" and "nor" are excluded from the stop-word list. Contractions
    are expanded before tokenizing, which turns forms such as "isn't" into
    "is not" so that the negation survives as the token "not".

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined by single spaces.
    """
    import contractions
    from autocorrect import Speller
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # 1. Correction of text
    spell = Speller(lang='en', fast=False)
    correction_data = spell(data)

    # 2. Contraction of text
    contraction_data = contractions.fix(correction_data)

    # 3. Tokenize and lower
    text_tokens = word_tokenize(contraction_data.lower())

    # 4. Remove punctuation and numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 5. Remove stop words but keep negations. The list is built here, not read
    #    from a notebook-level variable, so the function works on a fresh kernel.
    negations = {"no", "not", "nor"}
    stop_words = set(stopwords.words("english")) - negations
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    # 6. Lemmatize with a single lemmatizer instance instead of one per token
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # Joining
    return " ".join(text_cleaned)

In [None]:
np.array(pd.Series(sample_text).apply(cleaning_fsa))

array(['oh man pretty cool thing problem problem'], dtype=object)

## CountVectorization and TF-IDF Vectorization

In [None]:
df = pd.read_csv("/content/sample_data/airline_tweets.csv")

In [None]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [None]:
# Cleaning the full dataset takes a long time, so only a 50-row sample is used.
# A fixed random_state makes the sample (and every result derived from it) reproducible.
df = df.sample(50, random_state=42)
df.head()

Unnamed: 0,airline_sentiment,text
9598,neutral,@USAirways I just reserved a flight with my c...
9826,negative,@USAirways when your routing system goes down ...
4141,neutral,@united I believe you have to follow me in ord...
13653,negative,@AmericanAir @robinreda being stuck two days i...
13881,negative,@AmericanAir is the worst airline to ever tra...


In [None]:
df2 = df.copy()

In [None]:
%%time
df2["text"] = df2["text"].apply(cleaning_fsa)

# Clean every tweet of the sample with the sentiment-analysis cleaning function.

CPU times: user 24.8 s, sys: 126 ms, total: 24.9 s
Wall time: 35 s


In [None]:
df2.head()

Unnamed: 0,airline_sentiment,text
9598,neutral,airway reserved flight companion certificate m...
9826,negative,airway routing system go ground long haul flig...
4141,neutral,united believe follow order send de
13653,negative,americanair robinreda stuck two day airport so...
13881,negative,americanair worst airline ever travel worst eq...


## CountVectorization

In [None]:
X = df2["text"]
y = df2["airline_sentiment"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, stratify = y, random_state = 42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
X_train

11214    airway phone support revoked orig flt late fli...
13881    americanair worst airline ever travel worst eq...
14081    americanair jameswester see james dedicated in...
12994           americanair phone minute said would advice
2160                           united two seat avail far u
6792     jetblue askamex alex plat holder provided acce...
6663     southwestair thank twitter say de someone unle...
9576     airway appalled departure messed return suppos...
12114                         americanair status flight su
10163    airway mmm flight even really exist create las...
5964     southwestair please accept apology lame childi...
4878               southwestair e please get companion pas
1968                                          united thank
2742     united bump group forced check bag wait minute ga
12172                              americanair thanks much
139      virginamerica book seat flight buy even check ...
5463               southwestair fortunemagazine great ne

In [None]:
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# To avoid data leakage, fit_transform is applied to the training data and only transform to the test data.
# CountVectorizer.fit identifies every unique token (the vocabulary) of the data it is fitted on.

# CountVectorizer.transform:
# counts how many times each vocabulary token occurs in each document.

# Because the transformation relies only on the unique tokens of the training data, a training set with few unique
# tokens causes most tokens of the test data to be ignored during its transformation. To limit this, the training
# set should be large enough to contain (nearly) all unique tokens.

In [None]:
vectorizer.get_feature_names_out()
# All unique tokens (the vocabulary learned from the training set).

array(['accept', 'access', 'addition', 'advice', 'ago', 'airline',
       'airport', 'airspace', 'airway', 'alex', 'alone', 'americanair',
       'amp', 'anxiety', 'apology', 'app', 'appalled', 'arrive', 'artman',
       'askamex', 'assistance', 'avail', 'avoid', 'back', 'bag', 'beta',
       'billion', 'board', 'boarding', 'book', 'bump', 'buy', 'call',
       'certificate', 'check', 'childish', 'closed', 'coming',
       'companion', 'create', 'creates', 'credit', 'crew', 'customer',
       'day', 'de', 'dedicated', 'departure', 'deserve', 'door', 'dumb',
       'earlier', 'eat', 'equivalent', 'europe', 'even', 'ever', 'every',
       'exist', 'failover', 'far', 'finally', 'find', 'flight', 'flip',
       'flt', 'fly', 'follow', 'follows', 'forced', 'fortunemagazine',
       'frustrated', 'fuel', 'fw', 'ga', 'gate', 'get', 'glad', 'go',
       'going', 'great', 'ground', 'group', 'haul', 'hold', 'holder',
       'inbound', 'instagram', 'ipad', 'iphone', 'irt', 'james',
       'jamesw

In [None]:
X_train_count.toarray()
# Convert the sparse matrix to a dense array so that its contents can be displayed directly.

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [None]:
df_train_count = pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out(), index=X_train.index)
df_train_count.head(5)

Unnamed: 0,accept,access,addition,advice,ago,airline,airport,airspace,airway,alex,alone,americanair,amp,anxiety,apology,app,appalled,arrive,artman,askamex,assistance,avail,avoid,back,bag,...,taken,telephone,thank,thanks,think,thought,told,travel,tweet,twitter,two,tx,united,unless,usairsucks,verification,virginamerica,wait,way,wd,well,working,worst,would,yet
11214,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13881,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0
14081,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12994,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
X_train

11214    airway phone support revoked orig flt late fli...
13881    americanair worst airline ever travel worst eq...
14081    americanair jameswester see james dedicated in...
12994           americanair phone minute said would advice
2160                           united two seat avail far u
6792     jetblue askamex alex plat holder provided acce...
6663     southwestair thank twitter say de someone unle...
9576     airway appalled departure messed return suppos...
12114                         americanair status flight su
10163    airway mmm flight even really exist create las...
5964     southwestair please accept apology lame childi...
4878               southwestair e please get companion pas
1968                                          united thank
2742     united bump group forced check bag wait minute ga
12172                              americanair thanks much
139      virginamerica book seat flight buy even check ...
5463               southwestair fortunemagazine great ne

In [None]:
X_train[11214]

'airway phone support revoked orig flt late flight made'

In [None]:
df_test_count = pd.DataFrame(X_test_count.toarray(), columns = vectorizer.get_feature_names_out(), index = X_test.index)
df_test_count.head()

Unnamed: 0,accept,access,addition,advice,ago,airline,airport,airspace,airway,alex,alone,americanair,amp,anxiety,apology,app,appalled,arrive,artman,askamex,assistance,avail,avoid,back,bag,...,taken,telephone,thank,thanks,think,thought,told,travel,tweet,twitter,two,tx,united,unless,usairsucks,verification,virginamerica,wait,way,wd,well,working,worst,would,yet
11136,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5519,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4361,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
X_test

11136    airway person check people plane board minute ...
7988     jetblue second incident lost baggage sent de t...
5519     southwestair thanks concerned conflicting text...
7136                         okay jetblue fleet fleet http
4361                 southwestair usd always fly southwest
16       virginamerica excited first cross country flig...
4072     united annricord agent said would refused agen...
6239       southwestair really craving pretzel please send
13782    americanair airway meal also u company care cu...
4141                   united believe follow order send de
14459    americanair think bt help need go ay way get n...
9581     airway americanair hold hour yesterday speakin...
11337       airway sitting tarlac min announcement sitting
4709     southwestair arrived late flight pilot got u i...
1396     united horrible attitude staff delay level ser...
6987                jetblue know flight deemed dark flight
11467                                spend hou hold airw

In [None]:
X_test.iloc[0]

'airway person check people plane board minute telling people americanair exec bene honored'

In [None]:
vectorizer.vocabulary_
# Maps each unique token to its column index in the document-term matrix (not to how many times it occurs).

{'airway': 8,
 'phone': 129,
 'support': 164,
 'revoked': 147,
 'orig': 124,
 'flt': 65,
 'late': 99,
 'flight': 63,
 'made': 108,
 'americanair': 11,
 'worst': 189,
 'airline': 5,
 'ever': 56,
 'travel': 174,
 'equivalent': 53,
 'irt': 90,
 'jameswester': 92,
 'see': 154,
 'james': 91,
 'dedicated': 46,
 'inbound': 86,
 'line': 102,
 'telephone': 168,
 'eat': 52,
 'billion': 26,
 'profit': 139,
 'minute': 113,
 'said': 150,
 'would': 190,
 'advice': 3,
 'united': 179,
 'two': 177,
 'seat': 152,
 'avail': 21,
 'far': 60,
 'jetblue': 93,
 'askamex': 19,
 'alex': 9,
 'plat': 134,
 'holder': 85,
 'provided': 140,
 'access': 1,
 'credit': 41,
 'airspace': 7,
 'lounge': 105,
 'jk': 94,
 'southwestair': 159,
 'thank': 169,
 'twitter': 176,
 'say': 151,
 'de': 45,
 'someone': 157,
 'unless': 180,
 'follow': 67,
 'follows': 68,
 'appalled': 16,
 'departure': 47,
 'messed': 112,
 'return': 146,
 'supposed': 165,
 'fly': 66,
 'alone': 10,
 'sick': 155,
 'kid': 95,
 'status': 161,
 'su': 163,
 'm

## TF-IDF

sklearn TF-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

# fit (run on the training set only) learns the vocabulary, i.e. every unique token of the training set, and the
# IDF weight of each token: in how many training documents (rows) the token occurs, relative to the total number
# of training documents.
# transform (applied to the training set and to the test set) counts how many times each vocabulary token occurs
# in each document (term frequency) and multiplies these counts by the IDF weights learned from the training set.
# The IDF weights are NOT re-computed on the test set; every transformation is based on the training set.
# With the default settings each document vector is then L2-normalized, which is how sklearn accounts for the
# length of a document instead of dividing by the number of tokens in it.

In [None]:
tf_idf_vectorizer.get_feature_names_out()

array(['accept', 'access', 'addition', 'advice', 'ago', 'airline',
       'airport', 'airspace', 'airway', 'alex', 'alone', 'americanair',
       'amp', 'anxiety', 'apology', 'app', 'appalled', 'arrive', 'artman',
       'askamex', 'assistance', 'avail', 'avoid', 'back', 'bag', 'beta',
       'billion', 'board', 'boarding', 'book', 'bump', 'buy', 'call',
       'certificate', 'check', 'childish', 'closed', 'coming',
       'companion', 'create', 'creates', 'credit', 'crew', 'customer',
       'day', 'de', 'dedicated', 'departure', 'deserve', 'door', 'dumb',
       'earlier', 'eat', 'equivalent', 'europe', 'even', 'ever', 'every',
       'exist', 'failover', 'far', 'finally', 'find', 'flight', 'flip',
       'flt', 'fly', 'follow', 'follows', 'forced', 'fortunemagazine',
       'frustrated', 'fuel', 'fw', 'ga', 'gate', 'get', 'glad', 'go',
       'going', 'great', 'ground', 'group', 'haul', 'hold', 'holder',
       'inbound', 'instagram', 'ipad', 'iphone', 'irt', 'james',
       'jamesw

In [None]:
X_train_tf_idf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.65380704, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.36770261, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.25517577]])

In [None]:
df_train_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out(),
                              index= X_train.index)
df_train_tfidf.head()


Unnamed: 0,accept,access,addition,advice,ago,airline,airport,airspace,airway,alex,alone,americanair,amp,anxiety,apology,app,appalled,arrive,artman,askamex,assistance,avail,avoid,back,bag,...,taken,telephone,thank,thanks,think,thought,told,travel,tweet,twitter,two,tx,united,unless,usairsucks,verification,virginamerica,wait,way,wd,well,working,worst,would,yet
11214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13881,0.0,0.0,0.0,0.0,0.0,0.326904,0.0,0.0,0.0,0.0,0.0,0.195466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.326904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.653807,0.0,0.0
14081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.304062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12994,0.0,0.0,0.0,0.462002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462002,0.0
2160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498208,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405894,0.0,0.376175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X_train.iloc[6]

'southwestair thank twitter say de someone unless follow southwestair follows twitter'

In [None]:
df_train_tfidf.iloc[16].sort_values(ascending=False)

# TF-IDF assigns low weights to tokens that occur in many documents, which makes those tokens less important.
# When TF-IDF-transformed text is given to an ML or DL model, the model focuses primarily on the tokens with high weights.

fortunemagazine    0.559793
news               0.559793
great              0.499118
southwestair       0.352343
place              0.000000
                     ...   
flt                0.000000
fly                0.000000
follow             0.000000
follows            0.000000
yet                0.000000
Name: 5463, Length: 192, dtype: float64

In [None]:
df_test_tfidf = pd.DataFrame(X_test_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out(), index = X_test.index)
df_test_tfidf.head()

Unnamed: 0,accept,access,addition,advice,ago,airline,airport,airspace,airway,alex,alone,americanair,amp,anxiety,apology,app,appalled,arrive,artman,askamex,assistance,avail,avoid,back,bag,...,taken,telephone,thank,thanks,think,thought,told,travel,tweet,twitter,two,tx,united,unless,usairsucks,verification,virginamerica,wait,way,wd,well,working,worst,would,yet
11136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210841,0.0,0.0,0.210841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.598152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X_test.iloc[3]

'okay jetblue fleet fleet http'