## Install and Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# !pip install nltk

## Tokenization

In [3]:
# import nltk
# nltk.download('punkt')

In [4]:
import nltk

In [5]:
# The trailing contractions are part of the sample: every output further down
# (sentence tokens, word tokens, the sentiment-analysis cleaning) relies on them.
sample_text = "Oh man, this is pretty cool. We will do more such things. don't, isn't, aren't, couldn't"

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
sentence_token = sent_tokenize(sample_text.lower())
sentence_token

['oh man, this is pretty cool.',
 'we will do more such things.',
 "don't, isn't, aren't, couldn't"]

In [8]:
word_token = word_tokenize(sample_text.lower())
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'do',
 "n't",
 ',',
 'is',
 "n't",
 ',',
 'are',
 "n't",
 ',',
 'could',
 "n't"]

## Removing Punctuation and Numbers

In [66]:
# Keep only purely alphabetic tokens: this drops punctuation, numbers and
# contraction pieces such as "n't" (compare with the output below).
# Use w.isalnum() instead to also keep tokens that contain digits.
tokens_without_punc = [w for w in word_token if w.isalpha()]
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'do',
 'is',
 'are',
 'could']

## Removing Stopwords

In [10]:
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bunyaminkeles\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
from nltk.corpus import stopwords

In [69]:
stop_words = stopwords.words("turkish")
stop_words

['acaba',
 'ama',
 'aslında',
 'az',
 'bazı',
 'belki',
 'biri',
 'birkaç',
 'birşey',
 'biz',
 'bu',
 'çok',
 'çünkü',
 'da',
 'daha',
 'de',
 'defa',
 'diye',
 'eğer',
 'en',
 'gibi',
 'hem',
 'hep',
 'hepsi',
 'her',
 'hiç',
 'için',
 'ile',
 'ise',
 'kez',
 'ki',
 'kim',
 'mı',
 'mu',
 'mü',
 'nasıl',
 'ne',
 'neden',
 'nerde',
 'nerede',
 'nereye',
 'niçin',
 'niye',
 'o',
 'sanki',
 'şey',
 'siz',
 'şu',
 'tüm',
 've',
 'veya',
 'ya',
 'yani']

In [73]:
stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [74]:
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'do',
 'is',
 'are',
 'could']

In [75]:
# Drop every token that appears in the stop-word list.
# NOTE(review): the original author's note says that for sentiment analysis the
# negative auxiliary verbs can be removed - presumably from stop_words, so that
# negations stay in the text; confirm the intent.
token_without_sw = [t for t in tokens_without_punc if t not in stop_words]
token_without_sw

['oh', 'man', 'pretty', 'cool', 'things', 'could']

In [76]:
# [i for i in stop_words if "n't" in i]

## Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bunyaminkeles\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [96]:
WordNetLemmatizer().lemmatize("driven")

'driven'

In [80]:
WordNetLemmatizer().lemmatize("children")

'child'

In [153]:
lem = [WordNetLemmatizer().lemmatize(t) for t in token_without_sw]

In [82]:
lem

['oh', 'man', 'pretty', 'cool', 'thing', 'could']

## Stemming

In [21]:
from nltk.stem import PorterStemmer

In [83]:
PorterStemmer().stem("driving")

'drive'

In [84]:
stem = [PorterStemmer().stem(t) for t in token_without_sw]

In [85]:
stem

['oh', 'man', 'pretti', 'cool', 'thing', 'could']

## Joining

In [86]:
" ".join(lem)

'oh man pretty cool thing could'

## Cleaning Function - NOT for sentiment analysis

In [26]:
import pandas as pd

In [154]:
def cleaning(data):
    """Normalize a text for generic NLP use (NOT suited to sentiment analysis).

    Steps: lowercase + tokenize, keep alphabetic tokens only, drop stop words,
    lemmatize, and join the result back into one space-separated string.
    Because tokenization happens before the alphabetic filter, contraction
    pieces such as "n't" are discarded, so negations are lost.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined by single spaces.

    Notes
    -----
    Relies on the notebook-level ``stop_words`` list and on ``word_tokenize``
    and ``WordNetLemmatizer`` imported from NLTK in earlier cells.
    """
    # 1. Tokenize
    text_tokens = word_tokenize(data.lower())

    # 2. Remove punctuation / numbers (anything that is not purely alphabetic)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 3. Remove stop words; a set gives constant-time membership checks
    stop_word_set = set(stop_words)
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_word_set]

    # 4. Lemmatize; one lemmatizer instance is reused for every token
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 5. Join back into a single string
    return " ".join(text_cleaned)

In [155]:
pd.Series(sample_text).apply(cleaning)

0    oh man pretty cool thing could
dtype: object

## Cleaning Function - for sentiment analysis

In [105]:
sample_text

"Oh man, this is pretty cool. We will do more such things. don't, isn't, aren't, couldn't"

In [106]:
import re
# \w matches word characters (letters, digits and underscore); \s matches whitespace.
# The leading ^ negates the class, so every character that is neither a word
# character nor whitespace (i.e. punctuation) is removed from the text.
# The pattern is a raw string so that \w and \s are not treated as (invalid) string escapes.
s = re.sub(r'[^\w\s]', '', sample_text)
word = word_tokenize(s)
word

['Oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'We',
 'will',
 'do',
 'more',
 'such',
 'things',
 'dont',
 'isnt',
 'arent',
 'couldnt']

In [156]:
def cleaning_fsa(data):
    """Clean a text for sentiment analysis, preserving negations.

    Punctuation is stripped *before* tokenizing, so contractions such as
    "don't" collapse into single alphabetic tokens ("dont"). Those tokens are
    not in ``stop_words`` and therefore survive the stop-word filter.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined by single spaces.

    Notes
    -----
    Relies on the notebook-level ``stop_words`` list and on ``word_tokenize``
    and ``WordNetLemmatizer`` imported from NLTK in earlier cells.
    """
    import re

    # 1. Remove punctuation: drop everything that is neither a word character
    #    (\w: letters, digits, underscore) nor whitespace (\s).
    #    Operate on the argument, not on the global sample_text.
    text = re.sub(r'[^\w\s]', '', data)

    # 2. Tokenize
    text_tokens = word_tokenize(text.lower())

    # 3. Remove numbers (keep purely alphabetic tokens)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 4. Remove stop words; a set gives constant-time membership checks
    stop_word_set = set(stop_words)
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_word_set]

    # 5. Lemmatize; one lemmatizer instance is reused for every token
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 6. Join back into a single string
    return " ".join(text_cleaned)

In [112]:
pd.Series(sample_text).apply(cleaning_fsa)

0    oh man pretty cool thing dont isnt arent couldnt
dtype: object

## Part of Speech Tagging

In [32]:
from nltk import pos_tag

In [33]:
text = "Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT. "

In [34]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bunyaminkeles\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [157]:
tokens = word_tokenize(text)

In [158]:
tagg = pos_tag(tokens)
tagg

[('Oh', 'UH'),
 ('man', 'NN'),
 ('this', 'DT'),
 ('is', 'VBZ'),
 ('pretty', 'JJ'),
 ('cool', 'NN'),
 ('We', 'PRP'),
 ('will', 'MD'),
 ('do', 'VB'),
 ('more', 'RBR'),
 ('such', 'JJ'),
 ('things', 'NNS'),
 ('dont', 'VBP'),
 ('isnt', 'JJ'),
 ('arent', 'NN'),
 ('couldnt', 'NN')]

## Named Entity Recognition

In [37]:
# from nltk import ne_chunk

In [167]:
import nltk

In [168]:
# ne_chunk is accessed through the imported nltk package: the bare name is
# never imported in this notebook (the `from nltk import ne_chunk` line is commented out).
ner = nltk.ne_chunk(tagg)
ner

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [('Oh', 'UH'), ('man', 'NN'), ('this', 'DT'), ('is', 'VBZ'), ('pretty', 'JJ'), ('cool', 'NN'), ('We', 'PRP'), ('will', 'MD'), ('do', 'VB'), ('more', 'RBR'), ('such', 'JJ'), ('things', 'NNS'), ('dont', 'VBP'), ('isnt', 'JJ'), ('arent', 'NN'), ('couldnt', 'NN')])

In [39]:
ner.draw()

## CountVectorization and TF-IDF Vectorization

In [113]:
df = pd.read_csv("airline_tweets.csv")

In [114]:
df.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [115]:
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [116]:
import pandas as pd
import numpy as np

In [117]:
df = df.iloc[:8, :]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [118]:
df2 = df.copy()

In [119]:
df2["text"] = df2["text"].apply(cleaning)

In [120]:
df2

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercial experience...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing
5,negative,virginamerica seriously would pay flight seat ...
6,positive,virginamerica yes nearly every time fly vx ear...
7,neutral,virginamerica really missed prime opportunity ...


## CountVectorization

In [121]:
X = df2["text"]
y = df2["airline_sentiment"]

In [122]:
from sklearn.model_selection import train_test_split

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, stratify = y, random_state = 42)

In [124]:
from sklearn.feature_extraction.text import CountVectorizer

In [125]:
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [126]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to keep the plain-list display.
vectorizer.get_feature_names_out().tolist()

['another',
 'away',
 'bad',
 'big',
 'dhepburn',
 'ear',
 'every',
 'fly',
 'go',
 'mean',
 'must',
 'nearly',
 'need',
 'really',
 'said',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [127]:
X_train_count.toarray()

array([[0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
        1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
        0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0]], dtype=int64)

In [128]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(X_train_count.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,another,away,bad,big,dhepburn,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,1,1,1,0,0,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0


In [129]:
pd.set_option('display.max_columns', 50)

In [130]:
X_train[6]

'virginamerica yes nearly every time fly vx ear worm go away'

In [58]:
vectorizer.vocabulary_

{'virginamerica': 20,
 'yes': 23,
 'nearly': 11,
 'every': 6,
 'time': 17,
 'fly': 7,
 'vx': 21,
 'ear': 5,
 'worm': 22,
 'go': 8,
 'away': 1,
 'dhepburn': 4,
 'said': 14,
 'today': 18,
 'must': 10,
 'mean': 9,
 'need': 12,
 'take': 15,
 'another': 0,
 'trip': 19,
 'really': 13,
 'big': 3,
 'bad': 2,
 'thing': 16}

## TF-IDF

How sklearn's TF-IDF differs from the standard TF-IDF:
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [132]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to keep the plain-list display.
tf_idf_vectorizer.get_feature_names_out().tolist()

['another',
 'away',
 'bad',
 'big',
 'dhepburn',
 'ear',
 'every',
 'fly',
 'go',
 'mean',
 'must',
 'nearly',
 'need',
 'really',
 'said',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [133]:
X_train_tf_idf.toarray()

array([[0.        , 0.31200802, 0.        , 0.        , 0.        ,
        0.31200802, 0.31200802, 0.31200802, 0.31200802, 0.        ,
        0.        , 0.31200802, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.31200802, 0.        , 0.        ,
        0.16281873, 0.31200802, 0.31200802, 0.31200802],
       [0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.34618161, 0.        , 0.        , 0.        ],
       [0.37082034, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.37082034,
        0.37082034, 0.        , 0.37082034, 0.        , 0.        ,
        0.37082034, 0.        , 0.        , 0.37082034, 0.37082034,
        0.19350944, 0.        , 0.        , 0.        ],
       [0.   

In [134]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(X_train_tf_idf.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,another,away,bad,big,dhepburn,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0.0,0.312008,0.0,0.0,0.0,0.312008,0.312008,0.312008,0.312008,0.0,0.0,0.312008,0.0,0.0,0.0,0.0,0.0,0.312008,0.0,0.0,0.162819,0.312008,0.312008,0.312008
1,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.346182,0.0,0.0,0.0
2,0.37082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37082,0.37082,0.0,0.37082,0.0,0.0,0.37082,0.0,0.0,0.37082,0.37082,0.193509,0.0,0.0,0.0
3,0.0,0.0,0.483803,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0,0.0


In [135]:
X_train[6]

'virginamerica yes nearly every time fly vx ear worm go away'

In [137]:
# TF-IDF weights of the third training document (row label 2), highest first.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf_frame = pd.DataFrame(X_train_tf_idf.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())
tfidf_frame.loc[2].sort_values(ascending=False)

another          0.370820
mean             0.370820
trip             0.370820
today            0.370820
take             0.370820
must             0.370820
need             0.370820
virginamerica    0.193509
fly              0.000000
thing            0.000000
worm             0.000000
vx               0.000000
bad              0.000000
big              0.000000
time             0.000000
dhepburn         0.000000
go               0.000000
said             0.000000
really           0.000000
away             0.000000
nearly           0.000000
ear              0.000000
every            0.000000
yes              0.000000
Name: 2, dtype: float64

In [151]:
### Dark theme for jupyter notebook, no need to run.
!pip install jupyterthemes
!pip install --upgrade jupyterthemes
!jt -l
#emir's choice
!jt -t chesterish -T -N -kl -tf ptsans -tfs 13 -f fira -fs 11 -ofs 10 -nf ptsans -nfs 13 -cursw 2 -cursc r -cellw 88% -vim
from jupyterthemes import jtplot
# choose which theme to inherit plotting style from
# onedork | grade3 | oceans16 | chesterish | monokai | solarizedl | solarizedd
jtplot.style(theme='chesterish', ticks=True,)
sns.set_style("whitegrid")





Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [146]:
import seaborn as sns
sns.set_style("whitegrid")

In [147]:
sns.set_style("whitegrid")

In [152]:
jtplot.style(theme='chesterish', ticks=True,)