# Data Cleaning and Vectorization For NLP

## Install and Import

In [11]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)

In [12]:
!pip install nltk



## Tokenization

In [13]:
import nltk

In [14]:
sample_text= "Oh man, this is pretty cool. We will do more such things. @mynet"

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [16]:
sentence_token = sent_tokenize(sample_text.lower())
sentence_token

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/yavuz/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
word_token = word_tokenize(sample_text.lower())
word_token

## Removing Punctuation and Numbers

In [None]:
tokens_without_punc = [w for w in word_token if w.isalpha()] # .isalnum() for number and object
tokens_without_punc

## Removing Stopwords

In [None]:
#nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words("english")
stop_words

In [None]:
tokens_without_punc

In [None]:
token_without_sw = [t for t in tokens_without_punc if t not in stop_words] # if you make a sentiment analysis , you can remove 
                                                                           # negative auxiliary verb
token_without_sw

## Data Normalization-Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
#nltk.download('wordnet')

In [None]:
WordNetLemmatizer().lemmatize("children")

In [None]:
lem = [WordNetLemmatizer().lemmatize(t) for t in token_without_sw]

In [None]:
lem

## Data Normalization-Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
PorterStemmer().stem("driving")

In [None]:
stem = [PorterStemmer().stem(t) for t in token_without_sw]

In [None]:
stem

## Joining

In [None]:
" ".join(lem)

## Cleaning Function - for classification (NOT for sentiment analysis)

In [None]:
def cleaning(data):
    
    #1. Tokenize
    text_tokens = word_tokenize(data.lower()) 
    
    #2. Remove Puncs
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    
    #3. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #4. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

In [None]:
pd.Series(sample_text).apply(cleaning)

## Cleaning Function - for sentiment analysis

In [None]:
sample_text= "Oh man, this is pretty cool. We will do more such things. don't aren't are not. no problem"

In [None]:
s = sample_text.replace("'",'')
word = word_tokenize(s)
word 

In [None]:
def cleaning_fsa(data):
    
    
    #1. removing upper brackets to keep negative auxiliary verbs in text
    text = data.replace("'",'')
         
    #2. Tokenize
    text_tokens = word_tokenize(text.lower()) 
    
    #3. Remove numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    
    #4. Removing Stopwords
    for i in ["not", "no"]:
        stop_words.remove(i)
        
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #5. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

In [None]:
stop_words

In [None]:
pd.Series(sample_text).apply(cleaning_fsa)

## CountVectorization and TF-IDF Vectorization

In [None]:
df = pd.read_csv("airline_tweets.csv")

In [None]:
df.head()

In [None]:
df = df[['airline_sentiment','text']]
df

In [None]:
df = df.iloc[:8, :]
df

In [None]:
df2 = df.copy()

In [None]:
df2["text"] = df2["text"].apply(cleaning)

In [None]:
df2

## CountVectorization

In [None]:
X = df2["text"] 
y = df2["airline_sentiment"]  #tek tek unique olan kelimeleri bulup hepsini vectörize edicek 

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, stratify = y, random_state = 42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)#xtrain büyük olmalı 
X_test_count = vectorizer.transform(X_test) #dönüştürdük sayısal verilere 

In [None]:
vectorizer.get_feature_names() 

In [None]:
X_train_count.toarray()

In [136]:
df_count = pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names())
df_count

Unnamed: 0,another,away,bad,big,dhepburn,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,1,1,1,0,0,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0


In [None]:
df_count = pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out())
df_count

In [138]:
X_train

6    virginamerica yes nearly every time fly vx ear...
0                          virginamerica dhepburn said
2    virginamerica today must mean need take anothe...
4                   virginamerica really big bad thing
Name: text, dtype: object

In [133]:
X_train[6]

'virginamerica yes nearly every time fly vx ear worm go away'

In [134]:
vectorizer.vocabulary_

{'virginamerica': 20,
 'yes': 23,
 'nearly': 11,
 'every': 6,
 'time': 17,
 'fly': 7,
 'vx': 21,
 'ear': 5,
 'worm': 22,
 'go': 8,
 'away': 1,
 'dhepburn': 4,
 'said': 14,
 'today': 18,
 'must': 10,
 'mean': 9,
 'need': 12,
 'take': 15,
 'another': 0,
 'trip': 19,
 'really': 13,
 'big': 3,
 'bad': 2,
 'thing': 16}

## TF-IDF

sklearn TD-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [104]:
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
#kaç adet geçiyor 
#ilgili token kaç defa ageçiyor 
#TF : her cümlede kaç defa geçiyor 
#corpus 100 satır olsun 10 tanesinde ahmet geçiyor 
#X_test de geçen kelime X_trainde geçmiyorsa 

In [105]:
tf_idf_vectorizer.get_feature_names()

['another',
 'away',
 'bad',
 'big',
 'dhepburn',
 'ear',
 'every',
 'fly',
 'go',
 'mean',
 'must',
 'nearly',
 'need',
 'really',
 'said',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [106]:
X_train_tf_idf.toarray()

array([[0.        , 0.31200802, 0.        , 0.        , 0.        ,
        0.31200802, 0.31200802, 0.31200802, 0.31200802, 0.        ,
        0.        , 0.31200802, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.31200802, 0.        , 0.        ,
        0.16281873, 0.31200802, 0.31200802, 0.31200802],
       [0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.34618161, 0.        , 0.        , 0.        ],
       [0.37082034, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.37082034,
        0.37082034, 0.        , 0.37082034, 0.        , 0.        ,
        0.37082034, 0.        , 0.        , 0.37082034, 0.37082034,
        0.19350944, 0.        , 0.        , 0.        ],
       [0.   

In [140]:
df_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names())
df_tfidf
#yeni featurelarımız oluyor 

Unnamed: 0,another,away,bad,big,dhepburn,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0.0,0.312008,0.0,0.0,0.0,0.312008,0.312008,0.312008,0.312008,0.0,0.0,0.312008,0.0,0.0,0.0,0.0,0.0,0.312008,0.0,0.0,0.162819,0.312008,0.312008,0.312008
1,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.346182,0.0,0.0,0.0
2,0.37082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37082,0.37082,0.0,0.37082,0.0,0.0,0.37082,0.0,0.0,0.37082,0.37082,0.193509,0.0,0.0,0.0
3,0.0,0.0,0.483803,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0,0.0


In [141]:
X_train[6] #virginica her satırda geçtiği için virginicamerika azaltmış o yüzden ben bu kelime ile classification yapamam 

'virginamerica yes nearly every time fly vx ear worm go away'

In [142]:
df_tfidf.loc[2].sort_values(ascending=False) #en düşük virginica amerika 

another          0.370820
mean             0.370820
trip             0.370820
today            0.370820
take             0.370820
must             0.370820
need             0.370820
virginamerica    0.193509
fly              0.000000
thing            0.000000
worm             0.000000
vx               0.000000
bad              0.000000
big              0.000000
time             0.000000
dhepburn         0.000000
go               0.000000
said             0.000000
really           0.000000
away             0.000000
nearly           0.000000
ear              0.000000
every            0.000000
yes              0.000000
Name: 2, dtype: float64

In [None]:
pd.DataFrame(X_test_tf_idf.toarray(),columns=tf_idf_vectorizer.get_feature_names())

In [None]:
X_test[3]