# Importing all the necesary packages


In [None]:
import re
import string
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading the Dataset

In [3]:
trial_data = pd.read_csv("C:\\Users\\aravi\\Desktop\\Desktop\\SemEval-2020-master\\SemEval-2020-master\\TrialData\\data1.csv", sep=',')
train_data = pd.read_csv("C:\\Users\\aravi\\Desktop\\Desktop\\SemEval-2020-master\\SemEval-2020-master\\TrainData\\data_7000_new.csv", sep=',', names=['image_name', 'Image_URL', 'OCR_extracted_text', 'corrected_text', 'Humour', 'Sarcasm', 'offensive', 'Motivational', 'Overall_Sentiment', 'Basis_of_classification'])

In [4]:
train_data = train_data[train_data.Overall_Sentiment != 'neutral']
train_data = train_data[~train_data.Overall_Sentiment.isnull()]
trial_data = trial_data[trial_data.Overall_Sentiment != 'neutral']
trial_data = trial_data[~trial_data.Overall_Sentiment.isnull()]

In [5]:
print(train_data.shape)
print(trial_data.shape)

(4448, 10)
(637, 10)


# Functions to Removing Punctuation, Numbers, and Special Characters 

In [6]:
def clean_tweets(tweet):
    tweet = re.sub('@(\\w{1,15})\b', '', str(tweet))
    tweet = tweet.replace("via ", "")
    tweet = tweet.replace("RT ", "")
    tweet = tweet.lower()
    return tweet
    
def clean_url(tweet):
    tweet = re.sub('http\\S+', '', tweet, flags=re.MULTILINE)   
    return tweet
    
def remove_stop_words(tweet):
    stops = set(stopwords.words("english"))
    stops.update(['.',',','"',"'",'?',':',';','(',')','[',']','{','}'])
    toks = [tok for tok in tweet if not tok in stops and len(tok) >= 3]
    return toks
    
def stemming_tweets(tweet):
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in tweet]
    return stemmed_words

def remove_number(tweet):
    newTweet = re.sub('\\d+', '', tweet)
    return newTweet

def remove_hashtags(tweet):
    result = ''

    for word in tweet.split():
        if word.startswith('#') or word.startswith('@'):
            result += word[1:]
            result += ' '
        else:
            result += word
            result += ' '

    return result

# Data Preprocessing

In [7]:
def preprocessing(tweet, swords = True, url = True, stemming = True, ctweets = True, number = True, hashtag = True):

    if ctweets:
        tweet = clean_tweets(tweet)

    if url:
        tweet = clean_url(tweet)

    if hashtag:
        tweet = remove_hashtags(tweet)
    
    twtk = TweetTokenizer(strip_handles=True, reduce_len=True)

    if number:
        tweet = remove_number(tweet)
    
    tokens = [w.lower() for w in twtk.tokenize(tweet) if w != "" and w is not None]

    if swords:
        tokens = remove_stop_words(tokens)

    if stemming:
        tokens = stemming_tweets(tokens)

    text = " ".join(tokens)

    return text

In [8]:
train_text  = train_data['corrected_text'].map(lambda x: preprocessing(x, swords = True, url = True, stemming = True, ctweets = True, number = True, hashtag = True))
s_train     = train_data['Overall_Sentiment']

trial_text  = trial_data['corrected_text'].map(lambda x: preprocessing(x, swords = True, url = True, stemming = True, ctweets = True, number = True, hashtag = True))
s_trial     = trial_data['Overall_Sentiment']

In [9]:
print(len(train_text), len(s_train))
print(len(trial_text), len(s_trial))

4448 4448
637 637


In [10]:
def bag_of_words(train, test):
    vec = CountVectorizer(analyzer='word', binary=True, min_df=1, max_features=25000)
    train = vec.fit_transform(train).toarray()
    test = vec.transform(test).toarray()
    return train, test

In [11]:
x_train, x_test = bag_of_words(train_text, trial_text)

In [12]:
print(x_train.shape)
print(x_test.shape)
print(len(s_train), len(s_trial))

(4448, 7590)
(637, 7590)
4448 637


# Defining the Classifier

In [13]:
clf = LinearSVC(C=0.1)

clf.fit(x_train, s_train)

y_pred = clf.predict(x_test)

In [14]:
print("F1.........: %f" %(f1_score(s_trial, y_pred, average="macro")))
print("Precision..: %f" %(precision_score(s_trial, y_pred, average="macro")))
print("Recall.....: %f" %(recall_score(s_trial, y_pred, average="macro")))
print("Accuracy...: %f" %(accuracy_score(s_trial, y_pred)))

F1.........: 0.215056
Precision..: 0.503283
Recall.....: 0.199376
Accuracy...: 0.687598


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
