In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv')

In [3]:
test = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/test.csv')

### Label = 1 means negative and label = 0 means positive

In [4]:
df.tail(20)

Unnamed: 0,id,label,tweet
31942,31943,0,this week is flying by #humpday - #wednesday...
31943,31944,0,@user modeling photoshoot this friday yay #mo...
31944,31945,0,you're surrounded by people who love you (even...
31945,31946,0,feel like... ðð¶ð #dog #summer #hot #h...
31946,31947,1,@user omfg i'm offended! i'm a mailbox and i'...
31947,31948,1,@user @user you don't have the balls to hashta...
31948,31949,1,"makes you ask yourself, who am i? then am i a..."
31949,31950,0,hear one of my new songs! don't go - katie ell...
31950,31951,0,"@user you can try to 'tail' us to stop, 'butt..."
31951,31952,0,i've just posted a new blog: #secondlife #lone...


In [5]:
test.head() #note that the test data does not have any labels, hence we cannot measure the accuracy.

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [6]:
len(df)
#since this dataset has 31962 examples, we can split it into train and test set, with test size = 10% of data
#so for testing we'll get roughly 3196 examples which are enough
#NOTE(review): no such split is performed in the cells of this notebook as shown; the model is later fit on every row of df, so there is no held-out set to measure accuracy on -- confirm whether a train/test split step is missing

31962

In [7]:
df['label'].value_counts() #label 1 (negative) is far rarer than label 0 (positive), so the classes are heavily imbalanced

0    29720
1     2242
Name: label, dtype: int64

In [8]:
# discard the 'id' column, which is not wanted as a feature
df.drop(columns=['id'], inplace=True)

In [9]:
# discard every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [10]:
print("Before removing all non-ASCII characters: ",df.loc[31955,'tweet'])
#we can see that we have a lot of non-ASCII characters in this dataset.
#so first we need to remove all the non-ASCII characters so that we can perform further pre-processing

Before removing all non-ASCII characters:  less than 2 weeks ððð¼ð¹ððµ @user #ibiza#bringiton#mallorca#holidays#summer  


In [11]:
# Drop every non-ASCII character from the tweets: encoding to ASCII with
# errors="ignore" discards anything that cannot be represented, and the
# resulting bytes are decoded back to str.
def _strip_non_ascii(text):
    ascii_bytes = text.encode("ascii", "ignore")
    return ascii_bytes.decode()

df['tweet'] = df['tweet'].apply(_strip_non_ascii)

In [12]:
print("After removing all non-ASCII characters: ",df.loc[31955,'tweet'])

After removing all non-ASCII characters:  less than 2 weeks  @user #ibiza#bringiton#mallorca#holidays#summer  


In [13]:
#Now we need to perform further pre-processing

In [14]:
#converting all uppercase characters to lowercase so that our model does not interpret 'Dog' and 'dog' as two different words

In [15]:
# lower-case every tweet
df['tweet'] = df['tweet'].apply(str.lower)

In [16]:
#using regular expressions, we'll remove all the characters which are not: "-,.#@ a-z A-Z 0-9"

In [17]:
import re

In [18]:
# Strip every character that is not a word character (letters, digits, '_'),
# '#', '@', '-', ',', '.' or a space.
# The hyphen is escaped on purpose: an unescaped "@-_" inside [...] is read as
# the range '@'..'_', which removes '-' from the keep-list and lets '[', '\',
# ']' and '^' through. \d is omitted because \w already covers digits.
df['tweet']=df['tweet'].apply(lambda x: re.sub(r'[^\w#@\-,. ]+','',x))

In [19]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [20]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer



In [21]:
# Shared instances, created once instead of once per token.
stemmer=SnowballStemmer("english")
lemmatizer=WordNetLemmatizer()

def lemmatize_(word):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (pos='v') with WordNet and the
    result is then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    word : str
        One token.

    Returns
    -------
    str
        The stemmed lemma.
    """
    return stemmer.stem(lemmatizer.lemmatize(word,pos='v'))

def preprocess(text):
    """Turn raw text into a space-separated string of stemmed tokens.

    The text is tokenized with gensim's simple_preprocess. Tokens that are
    gensim stop words or are 3 characters long or shorter are discarded; each
    remaining token is passed through lemmatize_ and the results are joined
    with single spaces.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The processed tokens joined by spaces (empty string if none survive).
    """
    kept = [
        lemmatize_(tok)
        for tok in simple_preprocess(text)
        if len(tok) > 3 and tok not in STOPWORDS
    ]
    return ' '.join(kept)

In [22]:
#preprocess('I was swimming yesterday with my friends')
import nltk
# fetch the WordNet corpus; lemmatize_() relies on WordNetLemmatizer, which presumably needs it at call time
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# run preprocess() on every tweet; the column now holds space-joined stemmed tokens instead of raw text
df['tweet']=df['tweet'].apply(preprocess)

In [24]:
df.head()

Unnamed: 0,label,tweet
0,0,user father dysfunct selfish drag kid dysfunct
1,0,user user thank lyft credit caus dont offer wh...
2,0,bihday majesti
3,0,model love time
4,0,factsguid societi motiv


In [25]:
#Now all the pre-processing is done, so we can move forward to build the model

In [26]:
x=df['tweet'].values

In [27]:
x.shape

(31962,)

In [28]:
y=df['label'].values

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# vector=CountVectorizer(min_df=5, ngram_range=(1,2)).fit(x)

In [32]:
# min_df=5 ignores terms that occur in fewer than 5 tweets; ngram_range=(1,2) builds features from single words and from pairs of adjacent words
vector=TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(x)

In [33]:
x_tfidf=vector.transform(x)

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# fit a logistic regression with default settings on the TF-IDF features of all rows
# NOTE(review): label 1 is only 2242 of 31962 rows (~7%), and nothing is held out for evaluation -- consider class_weight='balanced' and a validation split
model=LogisticRegression().fit(x_tfidf,y)

In [36]:
# get the feature names as numpy array
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement, with a fallback kept for releases that predate it
if hasattr(vector, 'get_feature_names_out'):
    feature_names = np.asarray(vector.get_feature_names_out())
else:
    feature_names = np.array(vector.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
# Label 1 is the negative class, so the most negative coefficients point to
# positive tweets and the largest ones to negative tweets
print('Positive words:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Negative words: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Positive words:
['happi' 'love' 'life' 'bihday' 'orlando' 'today' 'smile' 'tomorrow'
 'weekend' 'healthi']

Negative words: 
['white' 'racism' 'allahsoil' 'racist' 'user allahsoil' 'bigot' 'trump'
 'black' 'women' 'altright']


In [37]:
input1='The weather is very good today'

In [38]:
input2="Donald Trump is a racist"

In [39]:
# classify the first sample sentence; label 1 is reported as "negative", anything else as "positive"
op=model.predict(vector.transform([preprocess(input1)]))
print("negative" if op[0]==1 else "positive")

positive


In [46]:

# classify the second sample sentence; label 1 is reported as "negative", anything else as "positive"
op2=model.predict(vector.transform([preprocess(input2)]))
print("negative" if op2[0]==1 else "positive")

negative


In [2]:
input3="Asians have experinced violence"

In [47]:
# classify the third sample sentence; label 1 is reported as "negative", anything else as "positive"
op3=model.predict(vector.transform([preprocess(input3)]))
# the verdict must be read from op3 (this cell's own prediction), not from op2
if(op3[0]==1):
    print("negative")
else:
    print("positive")

negative
