In [2]:
# Mount Google Drive so the dataset CSVs under /content/drive are readable.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import nltk

# NLTK resources fetched by this notebook, in download order.
NLTK_RESOURCES = (
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'stopwords',
)

download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Display the status of the final download as the cell's output.
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import re

In [None]:
# Seed NumPy's global random number generator so random draws made through it
# are repeatable. This cell must be executed before any cell that samples.
np.random.seed(500)

In [5]:
def clean_text(input):
  """Reduce a piece of text to space-separated alphanumeric words.

  Every character other than 0-9, A-Z, a-z and the space is replaced by a
  space, runs of whitespace are collapsed to a single space, and leading and
  trailing whitespace is removed.

  Args:
    input: The value to clean. It is converted with ``str()`` first, so
      non-string values (e.g. numbers or ``None``) are accepted.

  Returns:
    The cleaned string; empty if nothing alphanumeric remains.
  """
  # NOTE: `input` shadows the builtin; the name is kept so existing callers
  # that pass it by keyword keep working.
  text = re.sub(r'[^0-9A-Za-z ]', ' ', str(input))
  # Collapse to ONE space (not to nothing, which would glue words together)
  # and actually use the result -- the substitution returns a new string.
  return re.sub(r'\s+', ' ', text).strip()

In [65]:
# How many rows to draw (with replacement) per original sarcastic tweet.
OVERSAMPLE_FACTOR = 4
# Fixed seed so the oversample is identical on every run, independent of
# whether/when the global NumPy seed cell was executed.
SAMPLING_SEED = 500

df_original = pd.read_csv('/content/drive/MyDrive/Dataset/train/train.En.csv')

df_extracted = pd.read_csv('/content/drive/MyDrive/Dataset/train/sarcastic_tweets.csv')
# Drop the first column (presumably a saved index -- TODO confirm) without
# mutating the frame in place.
df_extracted = df_extracted.drop(columns=df_extracted.columns[0])

# Rows of the extracted file labelled sarcastic
df_sarcastic = df_extracted[df_extracted['sarcastic'] == 1]
# Non-sarcastic tweets from the original training dataset
df_not_sarcastic = df_original[df_original['sarcastic'] == 0][['tweet', 'sarcastic']]
# Rows of the extracted file labelled non-sarcastic (the rephrases of the
# sarcastic tweets)
rephrase = df_extracted[df_extracted['sarcastic'] == 0]

count = df_sarcastic.shape[0]

# Oversample the sarcastic tweets (sampling with replacement)
df_sarcastic_over = df_sarcastic.sample(
    count * OVERSAMPLE_FACTOR, replace=True, random_state=SAMPLING_SEED
)

# Training frame: oversampled sarcastic tweets + rephrases + all non-sarcastic
# tweets (the non-sarcastic class is NOT undersampled).
df = pd.concat([df_sarcastic_over, rephrase, df_not_sarcastic], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined training frame.
df.head(10)

In [67]:
# Normalise the training tweets in one pass and store the token lists back
# into the 'tweet' column.
df['tweet'] = (
    df['tweet']
    .apply(clean_text)       # non-alphanumeric characters become spaces
    .astype(str)
    .str.lower()             # lower-case every letter
    .apply(word_tokenize)    # each tweet becomes a list of word tokens
)

In [69]:
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/Task-A/task_A_En_test.csv')

# Clean each test tweet (non-alphanumeric characters become spaces) and make
# sure every entry is a string.
test_tweets = df_test['tweet'].apply(clean_text).astype(str)
# Lower-case, then split every tweet into a list of word tokens.
df_test['tweet'] = [word_tokenize(text.lower()) for text in test_tweets]

In [None]:
# Display the tokenised test frame.
df_test

In [71]:
# Lookup from the first letter of a POS tag to a WordNet part of speech.
# Any letter not listed falls back to NOUN.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {
        'J': wn.ADJ,
        'V': wn.VERB,
        'R': wn.ADV,
    },
)

In [72]:
# Build the stopword set and the lemmatizer once: re-reading the stopword list
# for every token and scanning it linearly dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))
word_lemmatized = WordNetLemmatizer()


def lemmatize_tokens(tokens):
  """Lemmatize one tokenised tweet.

  Tokens that are English stopwords or not purely alphabetic are dropped; the
  rest are lemmatized using the WordNet part of speech looked up in `tag_map`
  from the first letter of their POS tag.

  Args:
    tokens: List of word tokens for a single tweet.

  Returns:
    ``str()`` of the list of lemmas (e.g. "['good', 'day']"), the same text
    format this notebook has always stored in 'tweet_final'.
  """
  lemmas = [
      word_lemmatized.lemmatize(word, tag_map[tag[0]])
      for word, tag in pos_tag(tokens)
      if word.isalpha() and word not in english_stopwords
  ]
  return str(lemmas)


# Assign whole columns positionally instead of writing one row at a time with
# df.loc[index, ...], which is slow and only correct when the index is 0..n-1.
df['tweet_final'] = [lemmatize_tokens(tokens) for tokens in df['tweet']]
df_test['tweet_final'] = [lemmatize_tokens(tokens) for tokens in df_test['tweet']]

In [73]:
# Learn the vocabulary and IDF weights from the training tweets only, and
# vectorise them in the same step.
tfidf_vect = TfidfVectorizer()
train_x = tfidf_vect.fit_transform(df['tweet_final'])

# Project the test tweets onto the vocabulary learned from the training set.
test_x = tfidf_vect.transform(df_test['tweet_final'])

In [39]:
# Separate vectorizer fitted on the test tweets, used to inspect the test
# vocabulary.
tfidf_vect_test = TfidfVectorizer().fit(df_test['tweet_final'])
test_vocab_size = len(tfidf_vect_test.vocabulary_)
print(test_vocab_size)

3537


In [74]:
# Number of distinct terms in the vocabulary of the vectorizer fitted on the training tweets.
print(len(tfidf_vect.vocabulary_))

8507


In [76]:
# Partition the test vocabulary into terms the training vocabulary also
# contains (`present`) and terms it lacks (`absent`).
test_vocab = tfidf_vect_test.vocabulary_
present = [term for term in test_vocab if term in tfidf_vect.vocabulary_]
absent = [term for term in test_vocab if term not in tfidf_vect.vocabulary_]

# Number of shared terms, shown as the cell output.
count = len(present)
count


2175

In [77]:
# Number of test-vocabulary terms that do not occur in the training vocabulary.
len(absent)

1362

In [98]:
# Labels for fitting and for scoring
train_labels = df['sarcastic']
test_labels = df_test['sarcastic']

# Support vector classifier with a sigmoid kernel, trained on the TF-IDF
# features of the training tweets.
SVM = svm.SVC(kernel='sigmoid', C=5.0)
SVM.fit(train_x, train_labels)

# Predict the labels of the test tweets and report per-class metrics.
predictions_SVM = SVM.predict(test_x)
print(classification_report(test_labels, predictions_SVM))

              precision    recall  f1-score   support

           0       0.89      0.68      0.77      1200
           1       0.21      0.51      0.30       200

    accuracy                           0.66      1400
   macro avg       0.55      0.60      0.54      1400
weighted avg       0.80      0.66      0.71      1400



In [99]:
# Confusion matrix of the SVM predictions: the test set's 'sarcastic' labels
# are passed as the true values, predictions_SVM as the predicted values.
print(confusion_matrix(df_test['sarcastic'],predictions_SVM))

[[819 381]
 [ 98 102]]


In [101]:
RESULTS_PATH = '/content/drive/MyDrive/Dataset/test/Task-A/task-a-svm-oversampled.csv'

# Side-by-side view of the true labels and the SVM predictions.
df_results = pd.DataFrame({
    'True-Value': df_test['sarcastic'],
    'Prediction': predictions_SVM,
})

# Export the test frame together with the predictions. Previously the file was
# written from df_test alone, so the "svm" results file held no predictions.
# All original columns are kept; 'Prediction' is appended on a copy, leaving
# df_test itself unchanged.
df_test.assign(Prediction=predictions_SVM).to_csv(RESULTS_PATH)

# Last expression, so the preview is actually displayed.
df_results.head(10)