In [93]:
# Attach Google Drive to the Colab runtime; the CSV paths used further down
# in this notebook all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
     

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [94]:
import nltk

# NLTK data packages fetched for this notebook, in the original order.
for nltk_package in ('punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
  nltk.download(nltk_package)
# The final download stays a bare expression so the cell still displays its
# return value.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import re

In [96]:
# Seed NumPy's global random state so NumPy-based randomness is repeatable.
RANDOM_SEED = 500
np.random.seed(RANDOM_SEED)

In [97]:
def clean_text(input):
  """Reduce raw text to alphanumeric words separated by single spaces.

  Every character other than 0-9, A-Z, a-z and space is replaced by a space,
  runs of whitespace are then collapsed to one space and leading/trailing
  whitespace is stripped.

  Args:
    input: The value to clean. Non-string values are converted with str().
      None and float NaN (pandas' missing-value marker) yield '' instead of
      the literal words "None"/"nan". The name shadows the builtin but is
      kept so keyword callers keep working.

  Returns:
    The cleaned string (possibly empty).
  """
  # NaN is the only float that is not equal to itself.
  if input is None or (isinstance(input, float) and input != input):
    return ''
  text = re.sub(r'[^0-9A-Za-z ]', ' ', str(input))
  # The original computed this substitution but discarded the result, so the
  # whitespace was never normalised. Collapse to ONE space (not the empty
  # string) so neighbouring words are not glued together.
  return re.sub(r'\s+', ' ', text).strip()
     

In [98]:
# Load the training data and split it by the 'sarcastic' label.
df_extracted = pd.read_csv('/content/drive/MyDrive/Dataset/train/sarcastic_tweets.csv')
# The first column is dropped by position (presumably an index column written
# along with the CSV - TODO confirm). A non-inplace drop avoids mutating in place.
df_extracted = df_extracted.drop(columns=df_extracted.columns[0])

# Extract the sarcastic tweets only. .copy() gives an independent frame so the
# column assignments in the preprocessing cell do not write to a slice of
# df_extracted (SettingWithCopyWarning).
df_sarcastic = df_extracted[df_extracted['sarcastic'] == 1].copy()

# Extract the non sarcastic rephrase of the sarcastic tweets
df_rephrase = df_extracted[df_extracted['sarcastic'] == 0].copy()

In [None]:

# Apply the same preprocessing to both training frames. One loop replaces the
# copy-pasted per-frame statements, which had converted df_sarcastic to str
# twice and never converted df_rephrase.
for frame in (df_sarcastic, df_rephrase):
  # Run clean_text over every tweet and make sure each entry is a str
  frame['tweet'] = frame['tweet'].apply(clean_text).astype(str)

  # lower case all letters
  frame['tweet'] = [entry.lower() for entry in frame['tweet']]

  # break tweets into a list of word tokens
  frame['tweet'] = [word_tokenize(entry) for entry in frame['tweet']]

In [100]:
# Show the first two tokenised rephrase rows
df_rephrase.iloc[:2]

Unnamed: 0,tweet,sarcastic
867,"[college, is, really, difficult, expensive, ti...",0
868,"[i, do, not, like, when, professors, don, t, w...",0


In [101]:
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/Task-C/task_C_En_test.csv')

# Each of the two text columns goes through the same steps as the training
# tweets: clean_text, cast to str, lower-case, then word tokenisation.
for text_column in ('text_0', 'text_1'):
  cleaned = df_test[text_column].apply(clean_text).astype(str)
  lowered = cleaned.map(str.lower)
  df_test[text_column] = lowered.map(word_tokenize)
     

In [102]:
# Show the first two tokenised test pairs
df_test.iloc[:2]

Unnamed: 0,text_0,text_1,sarcastic_id
0,"[i, see, that, your, team, played, well, today]","[i, m, sorry, that, your, team, didn, t, win, ...",0
1,"[anthony, taylor, is, such, a, fair, referee, ...","[i, hope, anthony, taylor, is, never, put, in,...",0


In [103]:
# WordNet part of speech keyed by the first character of a pos_tag tag:
# J -> adjective, V -> verb, R -> adverb; any other key falls back to noun.
tag_map = defaultdict(
  lambda: wn.NOUN,
  {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [104]:
# Marker token placed between the two texts of a pair
separator = "SEP"
# Empty training frame; the feature-building cells below fill it row by row
df_final = pd.DataFrame(columns=['tweet_final', 'sarcastic'])

In [105]:
# Build the label-0 training rows: sarcastic tweet, separator, then its rephrase.
# The two columns are paired purely by position via zip - assumes row i of
# df_sarcastic and row i of df_rephrase belong together (TODO confirm).

# Hoisted out of the loops: stopwords.words('english') used to be evaluated for
# every single token and searched as a list; a set built once gives
# constant-time membership checks. The lemmatizer is likewise created once.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# `count` is the next row label in df_final; the following cell keeps using it.
count = 0
for element_1, element_2 in zip(df_sarcastic['tweet'], df_rephrase['tweet']):
  # POS-tag each full token list, keep alphabetic non-stop-words, and lemmatise
  # them with the WordNet POS looked up from the tag's first character.
  halves = [
    [lemmatizer.lemmatize(word, tag_map[tag[0]])
     for word, tag in pos_tag(tokens)
     if word not in stop_words and word.isalpha()]
    for tokens in (element_1, element_2)
  ]
  final_words = halves[0] + [separator] + halves[1]
  # Stored as the string form of the list, as before.
  df_final.loc[count, 'tweet_final'] = str(final_words)
  df_final.loc[count, 'sarcastic'] = 0
  count = count + 1

In [106]:
# Build the label-1 training rows: rephrase first, separator, then the
# sarcastic tweet (the mirror image of the label-0 rows).
# `count` is deliberately not reset here: it continues from the value left by
# the previous feature-building cell, so these rows are appended after the
# label-0 rows. Run that cell first.

# Hoisted out of the loops: stopwords.words('english') used to be evaluated for
# every single token and searched as a list; a set built once gives
# constant-time membership checks. The lemmatizer is likewise created once.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

for element_1, element_2 in zip(df_sarcastic['tweet'], df_rephrase['tweet']):
  # Note the swapped order: the rephrase (element_2) is processed first.
  halves = [
    [lemmatizer.lemmatize(word, tag_map[tag[0]])
     for word, tag in pos_tag(tokens)
     if word not in stop_words and word.isalpha()]
    for tokens in (element_2, element_1)
  ]
  final_words = halves[0] + [separator] + halves[1]
  df_final.loc[count, 'tweet_final'] = str(final_words)
  df_final.loc[count, 'sarcastic'] = 1
  count = count + 1

In [107]:
# Build the test documents: text_0 tokens, separator, text_1 tokens - the same
# filtering and lemmatisation as the training rows.

# Hoisted out of the loops: stopwords.words('english') used to be evaluated for
# every single token and searched as a list; a set built once gives
# constant-time membership checks. The lemmatizer is likewise created once.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

test_documents = []
for element_1, element_2 in zip(df_test['text_0'], df_test['text_1']):
  halves = [
    [lemmatizer.lemmatize(word, tag_map[tag[0]])
     for word, tag in pos_tag(tokens)
     if word not in stop_words and word.isalpha()]
    for tokens in (element_1, element_2)
  ]
  test_documents.append(str(halves[0] + [separator] + halves[1]))

# One positional column assignment replaces the row-by-row .loc[count, ...]
# writes, which only hit the right rows while df_test's index was 0..n-1.
df_test['tweet_final'] = test_documents

In [108]:
# Encode the labels. The encoder is fitted on the TRAINING labels only and then
# applied to the test labels, so both share one mapping. Re-fitting on the test
# labels (as before) would silently build a second, possibly different mapping.
Encoder = LabelEncoder()
# df_final was filled cell by cell, so its label column has object dtype; cast
# to int so the learned classes are plain integers like the test labels.
train_y = Encoder.fit_transform(df_final['sarcastic'].astype(int))
# transform() raises ValueError if the test set holds a label unseen in training.
test_y = Encoder.transform(df_test['sarcastic_id'])
train_y

array([0, 0, 0, ..., 1, 1, 1])

In [109]:
# Inspect one assembled training document (row label 1733)
df_final.at[1733, 'tweet_final']

"['unfortunately', 'one', 'cookery', 'skill', 'make', 'able', 'adult', 'properly', 'SEP', 'might', 'rubbish', 'drive', 'less', 'stellar', 'career', 'really', 'good', 'make', 'spanish', 'omelette', 'know']"

In [110]:
# Learn the TF-IDF vocabulary and weights from the training documents only,
# then project the test documents onto that same vocabulary.
# NOTE(review): a default TfidfVectorizer scores single words regardless of
# their position, so which side of "SEP" a word sits on is lost. Each training
# pair and its swapped copy (label 0 vs label 1) therefore get the same vector.
# Consider side-specific features before trusting the scores.
tfidf_vect = TfidfVectorizer()
train_x = tfidf_vect.fit_transform(df_final['tweet_final'])
test_x = tfidf_vect.transform(df_test['tweet_final'])

In [111]:
# Number of distinct terms learned from the training documents
train_vocab_size = len(tfidf_vect.vocabulary_)
print(train_vocab_size)

3910


In [112]:
# A second vectorizer fitted on the test documents alone, used only to inspect
# the test vocabulary; the model features come from tfidf_vect.
tfidf_vect_test = TfidfVectorizer().fit(df_test['tweet_final'])
test_vocab_size = len(tfidf_vect_test.vocabulary_)
print(test_vocab_size)

1288


In [113]:
# Split the test vocabulary into terms the training vocabulary also contains
# (present) and terms it does not (absent); count is the size of the overlap.
train_vocabulary = tfidf_vect.vocabulary_
present = [term for term in tfidf_vect_test.vocabulary_ if term in train_vocabulary]
absent = [term for term in tfidf_vect_test.vocabulary_ if term not in train_vocabulary]
count = len(present)

count

879

In [123]:
# Linear-kernel support vector classifier trained on the TF-IDF features
SVM = svm.SVC(kernel='linear', C=5.0)
SVM.fit(train_x, train_y)

# Predict labels for the test pairs and print the per-class metrics
predictions_SVM = SVM.predict(test_x)
svm_report = classification_report(df_test['sarcastic_id'], predictions_SVM)
print(svm_report)

              precision    recall  f1-score   support

           0       0.58      0.53      0.55       107
           1       0.50      0.55      0.53        93

    accuracy                           0.54       200
   macro avg       0.54      0.54      0.54       200
weighted avg       0.54      0.54      0.54       200



In [124]:
# Confusion matrix of true test labels against the SVM predictions
svm_confusion = confusion_matrix(df_test['sarcastic_id'], predictions_SVM)
print(svm_confusion)

[[57 50]
 [42 51]]


In [125]:
# Put the true labels and the SVM predictions side by side and save them
df_results = pd.DataFrame({
  'True-value': df_test['sarcastic_id'],
  'Prediction': predictions_SVM,
})
results_path = "/content/drive/MyDrive/Dataset/test/Task-C/svm-results.csv"
df_results.to_csv(results_path)