In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv')

In [3]:
test = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/test.csv')

### Label =1 means negative and label=0 means positive 

In [4]:
df.tail(20)

Unnamed: 0,id,label,tweet
31942,31943,0,this week is flying by #humpday - #wednesday...
31943,31944,0,@user modeling photoshoot this friday yay #mo...
31944,31945,0,you're surrounded by people who love you (even...
31945,31946,0,feel like... ðð¶ð #dog #summer #hot #h...
31946,31947,1,@user omfg i'm offended! i'm a mailbox and i'...
31947,31948,1,@user @user you don't have the balls to hashta...
31948,31949,1,"makes you ask yourself, who am i? then am i a..."
31949,31950,0,hear one of my new songs! don't go - katie ell...
31950,31951,0,"@user you can try to 'tail' us to stop, 'butt..."
31951,31952,0,i've just posted a new blog: #secondlife #lone...


In [5]:
test.head() #note that the test data does not have any labels, hence we cannot measure the accuracy.

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [6]:
len(df)
#since this dataset has 31962 examples, we can split it into train and test set, with test size = 10% of data
#so for testing we'll get roughly 3196 examples which are enough

31962

In [7]:
df['label'].value_counts() #we can see that negative counts are very low

0    29720
1     2242
Name: label, dtype: int64

In [8]:
df.drop(['id'],axis=1,inplace=True) #removing this unwanted column

In [9]:
df.dropna(inplace=True) #removing all rows having null values

In [10]:
print("Before removing all non-ASCII characters: ",df.loc[31955,'tweet'])
#we can see that we have a lot of non-ASCII characters in this dataset.
#so first we need to remove all the non ascii characters so that we can perform futher pre-precessing

Before removing all non-ASCII characters:  less than 2 weeks ððð¼ð¹ððµ @user #ibiza#bringiton#mallorca#holidays#summer  


In [11]:
#in this step, we convert all the non-ASCII characters to ASCII characters
df['tweet']=df['tweet'].apply(lambda x : x.encode("ascii","ignore").decode())

In [12]:
print("After removing all non-ASCII characters: ",df.loc[31955,'tweet'])

After removing all non-ASCII characters:  less than 2 weeks  @user #ibiza#bringiton#mallorca#holidays#summer  


In [13]:
#Now we need to perform futher pre-processing

In [14]:
#converting all uppercase characters to lowercase so that our model does not interprete 'Dog' and 'dog' as two different characters

In [15]:
df['tweet']=df['tweet'].apply(lambda x: x.lower())

In [16]:
#using regular expressions, we'll remove all the characters which are not: "-,.#@ a-z A-Z 0-9"

In [17]:
import re

In [18]:
df['tweet']=df['tweet'].apply(lambda x: re.sub(r'[^\d\w#@-_,. ]+','',x))

In [19]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

unable to import 'smart_open.gcs', disabling that module


In [20]:
stemmer=SnowballStemmer("english")

def lemmatize_(word):
    return stemmer.stem(WordNetLemmatizer().lemmatize(word,pos='v'))

def preprocess(text):
    result=[]
    tokens=simple_preprocess(text)
    for token in tokens:
        if(token not in STOPWORDS and len(token)>3):
            result.append(lemmatize_(token))
            
    return ' '.join(result)

In [21]:
preprocess('I was swimming yesterday with my friends')

'swim yesterday friend'

In [22]:
df['tweet']=df['tweet'].apply(preprocess)

In [23]:
df.head()

Unnamed: 0,label,tweet
0,0,user father dysfunct selfish drag kid dysfunct
1,0,user user thank lyft credit caus dont offer wh...
2,0,bihday majesti
3,0,model love time
4,0,factsguid societi motiv


In [24]:
#Now all the pre-processing is done, so we can move forward to build the model

In [25]:
x=df['tweet'].values

In [26]:
x.shape

(31962,)

In [27]:
y=df['label'].values

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# vector=CountVectorizer(min_df=5, ngram_range=(1,2)).fit(x)

In [31]:
vector=TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(x)

In [50]:
vector.get_feature_names()

['aamp',
 'abandon',
 'abil',
 'abl',
 'abl rememb',
 'ableism',
 'abrahamhick',
 'abrahamhick lawofattract',
 'absolut',
 'abund',
 'abus',
 'academi',
 'accent',
 'accept',
 'accept role',
 'accept stoprac',
 'accept user',
 'access',
 'accessori',
 'accid',
 'accomplish',
 'accord',
 'account',
 'account user',
 'accus',
 'ace',
 'ach',
 'achiev',
 'acknowledg',
 'acn',
 'acn altwaystoh',
 'acoust',
 'acquir',
 'act',
 'act like',
 'action',
 'activ',
 'activist',
 'actor',
 'actor seeklearn',
 'actorslif',
 'actress',
 'actual',
 'actual believ',
 'adam',
 'adapt',
 'adapt environ',
 'add',
 'addict',
 'addit',
 'address',
 'adel',
 'adida',
 'admin',
 'administr',
 'admir',
 'admit',
 'adopt',
 'ador',
 'adrenalin',
 'adult',
 'adulteri',
 'advanc',
 'advantag',
 'adveis',
 'adventur',
 'adventur time',
 'advic',
 'advis',
 'advoc',
 'affair',
 'affect',
 'affect orlando',
 'affirm',
 'affirm motiv',
 'afford',
 'afghan',
 'afraid',
 'africa',
 'african',
 'african american',
 'af

In [47]:
len(vector.idf_)

7089

In [48]:
len(vector.vocabulary_)

7089

In [49]:
vector.idf_[3847]

5.254237064767768

In [45]:
vector.vocabulary_

{'user': 6201,
 'father': 1847,
 'selfish': 5107,
 'drag': 1548,
 'kid': 3126,
 'user father': 6317,
 'thank': 5801,
 'lyft': 3599,
 'credit': 1201,
 'caus': 834,
 'dont': 1507,
 'offer': 4100,
 'user user': 6598,
 'user thank': 6568,
 'bihday': 509,
 'majesti': 3620,
 'bihday majesti': 522,
 'model': 3808,
 'love': 3484,
 'time': 5931,
 'model love': 3809,
 'love time': 3557,
 'factsguid': 1787,
 'societi': 5368,
 'motiv': 3862,
 'huge': 2807,
 'fare': 1829,
 'talk': 5717,
 'leav': 3249,
 'chao': 874,
 'camp': 778,
 'tomorrow': 5989,
 'danni': 1292,
 'user camp': 6249,
 'tomorrow user': 5993,
 'school': 5042,
 'year': 7035,
 'exam': 1733,
 'think': 5881,
 'hate': 2601,
 'imagin': 2855,
 'actorslif': 40,
 'girl': 2244,
 'year year': 7043,
 'land': 3187,
 'cav': 835,
 'champion': 863,
 'cleveland': 968,
 'welcom': 6867,
 'user welcom': 6613,
 'ireland': 3005,
 'consum': 1119,
 'price': 4576,
 'index': 2882,
 'climb': 976,
 'previous': 4572,
 'blog': 587,
 'silver': 5237,
 'gold': 2317,


In [32]:
x_tfidf=vector.transform(x)

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
model=LogisticRegression().fit(x_tfidf,y)

In [42]:
model.coef_[0]

array([ 1.41026351, -0.13717921, -0.1787412 , ...,  1.999898  ,
       -0.2143949 , -0.16723058])

In [52]:
len(model.coef_[0])

7089

In [35]:
# get the feature names as numpy array
feature_names = np.array(vector.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Positive words:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Negative words: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Positive words:
['happi' 'love' 'life' 'bihday' 'orlando' 'today' 'smile' 'tomorrow'
 'weekend' 'healthi']

Negative words: 
['white' 'racism' 'allahsoil' 'racist' 'user allahsoil' 'bigot' 'trump'
 'black' 'women' 'altright']


In [36]:
input1='The weather is very good today'

In [37]:
input2="Donald Trump is a racist"

In [38]:
model.predict(vector.transform([preprocess(input1)]))

array([0], dtype=int64)

In [39]:
model.predict(vector.transform([preprocess(input2)]))

array([1], dtype=int64)

In [40]:
input3="all Americans are black and useless"

In [41]:
model.predict(vector.transform([preprocess(input3)]))

array([1], dtype=int64)