In [24]:
import re
import string
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [53]:
import nltk
# Fetch the NLTK data packages this notebook relies on. Each call is a no-op
# apart from a log message when the package is already up to date.
nltk.download('stopwords')  # backs stopwords.words('english')
nltk.download('wordnet')    # backs WordNetLemmatizer
# NOTE(review): presumably the multilingual WordNet data that newer NLTK
# releases need alongside 'wordnet' - confirm it is still required.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...


True

In [64]:
# Read the tweet dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer=r'./tweets.csv')
df.head(n=5)

Unnamed: 0,Tweet,Avg
0,"I have to say, Apple has by far the best custo...",2.0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0
2,LOVE U @APPLE,1.8
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8
4,.@apple has the best customer service. In and ...,1.8


In [65]:
# Keep only the tweet text as a Series of strings. Missing values (if any) are
# blanked out *before* the string cast: astype('str') alone would turn them into
# the literal token 'nan', which would then be counted as a keyword.
df = df['Tweet'].fillna('').astype('str')
df.shape

(1181,)

In [66]:
# Preview the first five tweets of the text-only Series.
df.head(n=5)

0    I have to say, Apple has by far the best custo...
1    iOS 7 is so fricking smooth & beautiful!! #Tha...
2                                        LOVE U @APPLE
3    Thank you @apple, loving my new iPhone 5S!!!!!...
4    .@apple has the best customer service. In and ...
Name: Tweet, dtype: object

In [76]:
# Helpers for stripping punctuation and stop words from tweets:
#   translate_table - maps every ASCII punctuation code point to None, so
#                     str.translate() deletes those characters.
#   stop_words      - NLTK's English stop-word list, held as a set.
translate_table = dict.fromkeys(map(ord, string.punctuation))
stop_words = {*stopwords.words('english')}

In [77]:
# Extend the stop-word set with domain-specific noise: generic filler words,
# Apple product/brand names, shopping verbs and promo spam tokens.
new_stop_words = [
    # generic filler
    'some', 'like', 'think', 'wow', 'one', 'http', 'web', 'really',
    'see', 'watch', 'apple', 'know', 'show', 'think', 'click', 'go', 'to',
    'great', 'very', 'good', 'many', 'more', 'people', 'made',
    # products / brand vocabulary
    'technology', 'tech', 'iphone', 'ipad', 'new', 'latest', 'phone',
    'itunes', 'brand', 'ipod', 'iphones', 'io',
    # shopping verbs and contractions left over after punctuation removal
    'get', 'buy', 'purchase', 'make', 'im', 'iam', 'dont', 'cant',
    # promo spam
    'promoipodplayerpromo', 'ipodplayerpromo', 'player', 'itune',
]
stop_words = stop_words | set(new_stop_words)

In [78]:
# Text Pre-Processing
# Each tweet is lower-cased, stripped of digits and punctuation, split on
# whitespace, filtered against stop_words and lemmatized, then re-joined into a
# single space-separated string that is appended to `corpus`.

# Built once up front: the lemmatizer does not need to be recreated per tweet.
lm = WordNetLemmatizer()

corpus = []
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index, value) pairs and is available on older versions as well.
for i, line in df.items():
    line = line.lower()
    line = re.sub(r"\d+", "", line)
    line = line.translate(translate_table)
    line = line.split()

    # Stop words are dropped first; the lemmatizer then reduces each remaining
    # word to its root/canonical form.
    line = [lm.lemmatize(word) for word in line if word not in stop_words]
    line = " ".join(line)
    corpus.append(line)

corpus[1100]

'teaser trailer macbook pro freak'

In [79]:
from sklearn.feature_extraction.text import CountVectorizer 

In [86]:
# Phrases that appear in more than 70% of the documents are ignored (max_df=0.7)
def get_top_keywords(corpus, upto=None):
    """Return the most frequent 2- and 3-word phrases found in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Pre-processed documents, one string per document.
    upto : int or None, optional
        Maximum number of phrases to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, int) tuples
        ``(phrase, total_count)`` pairs sorted by count, highest first.

    Notes
    -----
    Uses the module-level ``stop_words`` collection. Phrases are kept only if
    they occur in at least 0.1% (``min_df=0.001``) and at most 70%
    (``max_df=0.7``) of the documents.
    """
    cv = CountVectorizer(max_df=0.7,
                         # scikit-learn documents stop_words as 'english', a
                         # list or None; recent versions reject a set.
                         stop_words=list(stop_words),
                         ngram_range=(2, 3),
                         min_df=0.001)
    # fit_transform() already returns the document-term matrix, so a second
    # transform() pass over the same corpus is unnecessary.
    bag_of_words = cv.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # 1 x n_features matrix of totals
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:upto]

In [87]:
# Collect the 70 most frequent phrases into a two-column table. Passing the
# column names to the constructor (instead of assigning .columns afterwards)
# also works when no phrase survives the filters and the list is empty.
top_words = get_top_keywords(corpus, upto=70)
top_words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

In [90]:
# Show the ten highest-ranked phrases.
top_words_df.iloc[:10]

Unnamed: 0,Word,Frequency
0,facebook amazon,11
1,card app,9
2,fingerprint scanner,8
3,steve job,7
4,vp marketing,7
5,marketing quiet,7
6,quiet photogs,7
7,photogs let,7
8,let photography,7
9,condescension anyone,7


In [99]:
# Another way to extract keywords (Using TF-IDF Vectorizer)
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words is converted to a list because scikit-learn documents the
# parameter as 'english', a list or None; recent versions reject a set.
tf_idf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2), min_df=0.001)
# fit_transform() learns the vocabulary and returns the TF-IDF matrix in one
# pass, so a separate transform() over the same corpus is not needed.
tf_vocab = tf_idf.fit_transform(corpus)

In [97]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps feature_names a plain list of str as before.
feature_names = tf_idf.get_feature_names_out().tolist()
feature_names[:10]

['aapl',
 'absolutely',
 'abt',
 'access',
 'accidentally',
 'acciones',
 'acciones de',
 'account',
 'acting',
 'activist']

In [105]:
# Rank every term by its TF-IDF weight summed over all documents and show the
# 30 highest-scoring ones.
sum_words = tf_vocab.sum(axis=0)
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in tf_idf.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
words_freq[:30]

[('promo', 18.90847376192999),
 ('freak', 17.17697860023629),
 ('store', 14.155237274781651),
 ('iphonec', 14.09608028645135),
 ('need', 12.001938748005974),
 ('time', 10.586806086811908),
 ('fingerprint', 10.454660578368726),
 ('thanks', 10.349970563467519),
 ('want', 9.713939612056656),
 ('de', 9.687285268615476),
 ('app', 9.422630904822924),
 ('would', 9.234892026321225),
 ('rt', 9.234569335351923),
 ('come', 9.08323098622442),
 ('hate', 9.027584016005337),
 ('look', 8.71127368495836),
 ('lol', 8.395419577608036),
 ('love', 8.290610155691734),
 ('better', 8.1330285471406),
 ('samsung', 7.994020693427633),
 ('generation', 7.812109373221487),
 ('android', 7.691626364749863),
 ('google', 7.644589459067082),
 ('please', 7.438880601565536),
 ('twitter', 7.349086435926864),
 ('battery', 7.256338447322632),
 ('guy', 7.237185640858383),
 ('microsoft', 7.154458874644534),
 ('give', 7.0109480231317),
 ('back', 7.008423730942729)]