In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import word_tokenize
%matplotlib inline

In [4]:
# reading data
bow = pd.read_csv('./bagofword.txt',delimiter=';',
                  header=None,names=['Sentence','category'])
bow.head()

Unnamed: 0,Sentence,category
0,Cricket is a game of gentlemen,1
1,Soccer Worldcup is round the corner,1
2,Sports evokes patriotism,1
3,Viewership of Badminton is increasing in India,1
4,Indian boxers won silver medal in CommonWelth ...,1


In [5]:
from nltk.corpus import stopwords

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srikanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Cleaning text
- Lower Case
- Remove special characters
- Stemming
- Lemma
- Remove Stopwords

In [10]:
stop = stopwords.words('english')
#print(stop)

Lower case:

In [11]:
text = 'Viewership of Badminton is increasing in India'
# lower
text = text.lower()
text

'viewership of badminton is increasing in india'

In [12]:
from nltk.stem import PorterStemmer,RegexpStemmer

In [13]:
ps = PorterStemmer()
ps.stem('increasing')

'increas'

In [18]:
r = RegexpStemmer('ing$|es$')

In [19]:
r.stem('increases')

'increas'

In [53]:
# tokenize  and stemming
tokens = word_tokenize(text)
print(tokens)
words = []
for word in tokens:
    if word not in stop:
        #print(word)
        words.append(ps.stem(word))

ctext = " ".join(words)
print(ctext)

['viewership', 'of', 'badminton', 'is', 'increasing', 'in', 'india']
viewership badminton increas india


In [30]:
# Lemmatization
from nltk import WordNetLemmatizer

In [31]:
lemma = WordNetLemmatizer()

In [47]:
lemma.lemmatize('likes')

'like'

In [52]:
# stopwords
tokens = word_tokenize(ctext)
print(tokens)
for word in tokens:
    if word not in stop:
        print(word)

['viewership', 'of', 'badminton', 'is', 'increas', 'in', 'india']
viewership
badminton
increas
india


In [54]:
import re

In [64]:
# For 
from nltk.stem import PorterStemmer
def cleantext(sentence):
    ps = PorterStemmer()
    # lowercase
    text = sentence.lower()
    # Remove special characters and number 
    text = re.sub(r'[^a-z]',' ',text)
    # tokenize
    token = word_tokenize(text)
    # remove stop and apply stemming
    words = []
    for word in token:
        if word not in stop: # removing stop words
            words.append(ps.stem(word)) # applying stemming
    
    return " ".join(words) # words into sentence

In [68]:
cleantext('Viewership of Badminton is increasing in India')

'viewership badminton increas india'

In [67]:
bow['Clean_Sentence'] = bow['Sentence'].apply(cleantext)

In [69]:
bow.head()

Unnamed: 0,Sentence,category,Clean_Sentence
0,Cricket is a game of gentlemen,1,cricket game gentlemen
1,Soccer Worldcup is round the corner,1,soccer worldcup round corner
2,Sports evokes patriotism,1,sport evok patriot
3,Viewership of Badminton is increasing in India,1,viewership badminton increas india
4,Indian boxers won silver medal in CommonWelth ...,1,indian boxer silver medal commonwelth game


## Word Embedding
- Bag of words

In [74]:
sentlist = list(bow['Clean_Sentence'])

In [76]:
from sklearn.feature_extraction.text import CountVectorizer # Bag of Word
cv = CountVectorizer()
cv.fit(sentlist)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [84]:
print(cv.get_feature_names())

['across', 'adil', 'arnold', 'arsen', 'australia', 'away', 'awe', 'badminton', 'bag', 'barcelona', 'batswoman', 'beat', 'bengal', 'bernabeu', 'border', 'boxer', 'came', 'carlo', 'caus', 'championship', 'charg', 'chase', 'church', 'citi', 'closer', 'coldest', 'commonwelth', 'consol', 'convert', 'corner', 'cricket', 'da', 'damag', 'dead', 'derbi', 'destroy', 'dhoni', 'dismiss', 'draw', 'drive', 'driver', 'earthquak', 'ecuador', 'edg', 'end', 'enter', 'evok', 'expect', 'experi', 'fanci', 'far', 'fc', 'feder', 'feel', 'ferrari', 'fifti', 'fight', 'five', 'flair', 'flood', 'florida', 'four', 'game', 'gentlemen', 'give', 'goal', 'gold', 'gp', 'hamilton', 'heavi', 'highlight', 'hima', 'hindi', 'hit', 'hotter', 'hum', 'ian', 'increas', 'india', 'indian', 'inexplic', 'isl', 'island', 'issoko', 'jadhav', 'kati', 'kedar', 'kerala', 'key', 'khan', 'kohli', 'kolkata', 'la', 'last', 'late', 'lead', 'leagu', 'leav', 'level', 'lewi', 'liga', 'llori', 'madrid', 'magnitud', 'maharashtra', 'major', 'mars

In [80]:
X = cv.transform(sentlist).toarray()

In [82]:
X.shape
# Sparse Matrix

(30, 191)

In [86]:
# convert into dataframe
df = pd.DataFrame(X,columns=cv.get_feature_names())
df.to_csv('bow_sport_weather.csv')

In [87]:
df['class'] = bow['category']
df.head()

Unnamed: 0,across,adil,arnold,arsen,australia,away,awe,badminton,bag,barcelona,...,way,weather,wembley,win,winter,world,worldcup,year,zealand,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [98]:
sports = list(df[df['class']==1].keys())
weather = list(df[df['class']==0].keys())

In [108]:
df[df['class']==1]!=0

Unnamed: 0,across,adil,arnold,arsen,australia,away,awe,badminton,bag,barcelona,...,way,weather,wembley,win,winter,world,worldcup,year,zealand,class
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
10,False,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True
11,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
12,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
13,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
14,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
