In [1]:
import numpy as np
import pandas as pd

## QUESTION 1
# spam detection

In [10]:
df = pd.read_csv("C:/Users/adity/Downloads/spam.csv",encoding='ISO-8859-1')
df = df[['v1', 'v2']]

In [12]:
df.columns = ['label','message']

In [13]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [27]:
import nltk
import regex as re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
stop_words = set(stopwords.words('english'))

In [30]:
def preprocessing(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [31]:
df['message']= df['message'].apply(preprocessing)

In [32]:
df['message']

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
                              ...                        
5567    nd time tri contact u u pound prize claim easi...
5568                                b go esplanad fr home
5569                              piti mood soani suggest
5570    guy bitch act like id interest buy someth el n...
5571                                       rofl true name
Name: message, Length: 5572, dtype: object

In [36]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [37]:
df

Unnamed: 0,label,message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though
...,...,...
5567,1,nd time tri contact u u pound prize claim easi...
5568,0,b go esplanad fr home
5569,0,piti mood soani suggest
5570,0,guy bitch act like id interest buy someth el n...


In [38]:
import gensim.downloader as api

model = api.load('word2vec-google-news-300') 
# downloads ~1.6GB model



In [40]:
def avg_word_vec(sentence,model):
    words = sentence.lower().split()
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    else :
        return np.mean(vectors,axis=0)

In [49]:
sentence_vectors = np.array([avg_word_vec(text, model) for text in df['message']])

In [50]:
sentence_vectors.shape

(5572, 300)

In [51]:
X = sentence_vectors

In [57]:
y= df['label'].to_numpy()

In [58]:
y.shape

(5572,)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [59]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [60]:
clf = LogisticRegression()

In [61]:
clf.fit(X_train,y_train)

In [63]:
predictions = clf.predict(X_test)

In [66]:
score = accuracy_score(y_test,predictions)

In [67]:
score

0.9363228699551569

In [75]:
def predict_message_class(model,w2v_model,message):
    preprocessed_text = preprocessing(message)
    vector = avg_word_vec(preprocessed_text,w2v_model).reshape(1,300)
    prediction = model.predict(vector)
    if prediction == 0:
        print("classified as: ham")
    else:
        print("classified as: spam")

In [76]:
message = """Congratulations! 🎉 You have been selected as the LUCKY WINNER of our $1,000,000 MEGA CASH PRIZE! This once-in-a-lifetime opportunity was randomly awarded to your mobile number and email address. To CLAIM your prize, please confirm your details by clicking the link below within 24 HOURS:

👉 www.cashprize-claim-now.net/redeem

Failure to respond within 24 hours will result in the FORFEITURE of your prize. This is a confidential notification and MUST NOT be shared. We are awaiting your confirmation. Kindly send your FULL NAME, ADDRESS, DATE OF BIRTH, and BANK DETAILS to: claims@secure-winner-portal.com

Act NOW to become a MILLIONAIRE! 💰

*Note: This promotion is approved by the Global Rewards Foundation. Terms & conditions apply. Contact +1-888-555-1234 for assistance.*"""


In [77]:
predict_message_class(clf,model,message)

classified as: spam


## QUESTION 2
# Sentiment Classification

In [79]:
data = pd.read_csv("C:/Users/adity/Downloads/Tweets.csv/Tweets.csv")

In [85]:
data.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [86]:
data = data[['airline_sentiment','text']]

In [87]:
data

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [95]:
!pip install contractions
import contractions



In [98]:
def preprocess_tweet(text, remove_emojis=True):
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_emojis:
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002500-\U00002BEF"  # Chinese characters
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    clean_text = ' '.join(tokens)

    return clean_text

In [99]:
data['cleaned_text'] = data['text'].apply(preprocess_tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_text'] = data['text'].apply(preprocess_tweet)


In [100]:
data['cleaned_text']

0                                                     said
1                   plus added commercial experience tacky
2                   today must mean need take another trip
3        really aggressive blast obnoxious entertainmen...
4                                     really big bad thing
                               ...                        
14635                   thank got different flight chicago
14637                        please bring american airline
14638    money change flight answer phone suggestion ma...
14639    8 people need 2 know many seat next flight plz...
Name: cleaned_text, Length: 14640, dtype: object

In [101]:
def avg_word_vec(sentence,model):
    words = sentence.lower().split()
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    else :
        return np.mean(vectors,axis=0)

In [125]:
X_tweets = np.array([avg_word_vec(x,model) for x in data['cleaned_text']])

In [126]:
X_tweets.shape

(14640, 300)

In [106]:
from sklearn.preprocessing import OneHotEncoder

In [120]:
data['airline_sentiment_encoded'] = data['airline_sentiment'].map({'neutral':0,'positive':1,'negative':2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['airline_sentiment_encoded'] = data['airline_sentiment'].map({'neutral':0,'positive':1,'negative':2})


In [127]:
y_tweets=data['airline_sentiment_encoded'].to_numpy()

In [128]:
y_tweets

array([0, 1, 0, ..., 0, 2, 0], dtype=int64)

In [129]:
X_train_tweets,X_test_tweets,y_train_tweets,y_test_tweets=train_test_split(X_tweets,y_tweets,test_size=0.2)

In [135]:
model_mlr = LogisticRegression(multi_class= 'ovr')

In [136]:
model_mlr.fit(X_train_tweets,y_train_tweets)



In [137]:
predictions=model_mlr.predict(X_test_tweets)

In [138]:
score=accuracy_score(predictions,y_test_tweets)

In [140]:
score

0.766051912568306

In [141]:
def predict_tweet_sentiment(model,w2v_model,tweet):
    cleaned_tweet = preprocess_tweet(tweet)
    vector = avg_word_vec(cleaned_tweet).reshape(1,300)
    prediction = model.predict(vector)
    if prediction == 0:
        print("neutral")
    elif prediction ==1:
        print("positive")
    else:
        print("negative")
    

In [None]:
tweet = "Just had the smoothest flight ever with @Delta! ✈️💺 Incredible service, comfy seats, and landed 10 minutes early. #travel #airplaneexperience"