In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.model_selection import GridSearchCV
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/viviek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/viviek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/viviek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/viviek/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Functions to pre-process the data

In [2]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def normalize_text(words):
    words = to_lowercase(words)
    return words

def tokenize(text):
    return nltk.word_tokenize(text)

def text_prepare(text):
    text = ' '.join([x for x in normalize_text(tokenize(text))])
    return text

## Reading and preprocessing 2k labelled data

In [3]:
df=pd.read_csv('sp+Ip+sn+In.csv')
df['text'] = [text_prepare(x) for x in df['text']]
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df

Unnamed: 0,text,label
0,its holy everyone up is quiet palmdale bbw ove...,0
1,may you all have a peaceful tiredbuthappy deca...,0
2,costco new york is my a tastes great and paire...,0
3,each time you shoot a zero you have to give me...,0
4,facebook strikes again – i liked a few pages o...,1
...,...,...
2154,a comment from my tutor about the recent �no m...,1
2155,doing my christmas shopping i noticed a perfec...,0
2156,and ive said im a feminist before but people j...,0
2157,god rescue is the best thing you could ever as...,0


In [4]:
df['text'] = [entry.lower() for entry in df['text']]
df['text']= [word_tokenize(entry) for entry in df['text']]

In [5]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [6]:
two_k_data  = df
for index,entry in enumerate(two_k_data['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    two_k_data.loc[index,'text'] = str(Final_words)
two_k_data.head()

Unnamed: 0,text,label
0,"['holy', 'everyone', 'quiet', 'palmdale', 'bbw...",0
1,"['may', 'peaceful', 'tiredbuthappy', 'decathlo...",0
2,"['costco', 'new', 'york', 'taste', 'great', 'p...",0
3,"['time', 'shoot', 'zero', 'give', 'kiss']",0
4,"['facebook', 'strike', 'like', 'page', 'compan...",1


In [7]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(two_k_data['text'],two_k_data['label'],test_size=0.2)

In [8]:
X_train=X_train.tolist()
y_train=y_train.tolist()
X_test=X_test.tolist()
y_test=y_test.tolist()

In [10]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(two_k_data['text'])
train_x = tfidf_transformer.transform(X_train)
test_x= tfidf_transformer.transform(X_test)

## Training the labelled data to get best model

In [11]:
c = [0.1, 1, 10]
hyperparameters = {'C': c, 'kernel': ['linear']}  
SVM = svm.SVC(probability=True)
res = GridSearchCV(SVM, hyperparameters)
bestmodel = res.fit(train_x,y_train)

### Results of labelled data

In [12]:
pred = bestmodel.predict(test_x)

print("Accuracy: ",accuracy_score( y_test,pred)*100)
print("F1 Score: ",f1_score(y_test,pred, average="weighted")*100)
print("Precision: ",precision_score(y_test, pred, average="weighted")*100)

Accuracy:  82.4074074074074
F1 Score:  81.73569023569024
Precision:  82.41292434440398


## Reading and Preprocessing the 23.9k data

In [13]:
df2=pd.read_csv('23900.csv')
df2.columns = ["text"]
df2['text'] = [text_prepare(x) for x in df2['text']]
df2['text'] = [entry.lower() for entry in df2['text']]
df2['text']= [word_tokenize(entry) for entry in df2['text']]

In [14]:
df2.shape

(23900, 1)

In [15]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [16]:
unlabelled_data  = df2
for index,entry in enumerate(unlabelled_data['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    unlabelled_data.loc[index,'text'] = str(Final_words)
unlabelled_data.head()

Unnamed: 0,text
0,['wish']
1,"['goooood', 'girl', 'uh', 'huh', 'please', 'ar..."
2,"['body', 'shaming', 'one', 'cruel', 'unnecessa..."
3,"['hey', 'glowin', 'presentin', 'new', 'body', ..."
4,"['presentin', 'new', 'body', 'scrubs', 'body',..."


In [17]:
# temp = unlabelled_data

## First Iteration

### Predicting labels using already trained model

In [18]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(unlabelled_data['text'])
data = tfidf_transformer.transform(unlabelled_data['text'])

results_prob = bestmodel.predict_proba(data)
results_label = bestmodel.predict(data)

In [19]:
# for i in range(len(results_prob)):
#     print(results_prob[i][0]," ",results_prob[i][1])

### Adding the selected data with a high degree of certainty ( threshold of 0.8 and 0.95 here ) to the labelled data and removing them from unlabelled data

In [21]:
ddict = {'text' : [] , 'label':[] }
a=[]
for i in range(len(results_prob)):
#     print(results_prob[i][1]," ",results_prob[i][1])
    if(results_prob[i][1] >= 0.8):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(1)
    elif(results_prob[i][0] >= 0.95):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(0)

In [23]:
to_append = pd.DataFrame(ddict)

upd_labelled_data = two_k_data
upd_labelled_data = upd_labelled_data.append(to_append)
upd_labelled_data = upd_labelled_data.reset_index(drop=True)

unlabelled_data=unlabelled_data.drop(unlabelled_data.index[a])
unlabelled_data = unlabelled_data.reset_index(drop=True)

### Size of updated labelled data and unlabelled data

In [24]:
print(upd_labelled_data.shape)
print(unlabelled_data.shape)

(6613, 2)
(19446, 1)


### After adding data to labelled data,  training the model again

In [25]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(upd_labelled_data['text'],upd_labelled_data['label'],test_size=0.2)

In [26]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(upd_labelled_data['text'])
train_x = tfidf_transformer.transform(X_train)
test_x= tfidf_transformer.transform(X_test)

In [27]:
c = [0.1, 1, 10]
hyperparameters = {'C': c,'kernel': ['linear']}  
SVM = svm.SVC(probability=True)
res = GridSearchCV(SVM, hyperparameters)
bestmodel = res.fit(train_x,y_train)
pred = bestmodel.predict(test_x)

In [28]:
print("Accuracy: ",accuracy_score( y_test,pred)*100)
print("F1 Score: ",f1_score(y_test,pred, average="weighted")*100)
print("Precision: ",precision_score(y_test, pred, average="weighted")*100)

Accuracy:  91.83673469387756
F1 Score:  90.57205225044179
Precision:  91.09693877551021


## Second Iteration

### Predicting labels using already trained model

In [29]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(unlabelled_data['text'])
data = tfidf_transformer.transform(unlabelled_data['text'])
results_prob_2 = bestmodel.predict_proba(data)
results_label_2 = bestmodel.predict(data)

In [30]:
# for i in range(len(results_prob_2)):
#     print(results_prob_2[i][0]," ",results_prob_2[i][1])

### Adding the selected data with a high degree of certainty ( threshold of 0.8 and 0.97 here ) to the labelled data and removing them from unlabelled data

In [31]:
a=[]
ddict = {'text' : [] , 'label':[] }
for i in range(len(results_prob_2)):
    if(results_prob_2[i][1] >= 0.8):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(1)
    elif(results_prob_2[i][0] >= 0.97):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(0)

In [32]:
to_append = pd.DataFrame(ddict)

upd_labelled_data_2 = upd_labelled_data
upd_labelled_data_2 = upd_labelled_data_2.append(to_append)
upd_labelled_data_2 = upd_labelled_data_2.reset_index(drop=True)

unlabelled_data = unlabelled_data.drop(unlabelled_data.index[a])
unlabelled_data = unlabelled_data.reset_index(drop=True)

### Size of updated labelled data and unlabelled data

In [33]:
print(len(upd_labelled_data_2))
print(len(unlabelled_data))

8401
17658


### After adding data to labelled data,  training the model again

In [34]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(upd_labelled_data_2['text'],upd_labelled_data_2['label'],test_size=0.2)

In [35]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(upd_labelled_data_2['text'])
train_x = tfidf_transformer.transform(X_train)
test_x= tfidf_transformer.transform(X_test)

In [36]:
c = [0.1, 1, 10]
hyperparameters = {'C': c,'kernel': ['linear']}  
SVM = svm.SVC(probability=True)
res = GridSearchCV(SVM, hyperparameters)
bestmodel = res.fit(train_x,y_train)
pred = bestmodel.predict(test_x)

In [37]:
print("Accuracy: ",accuracy_score( y_test,pred)*100)
print("F1 Score: ",f1_score(y_test,pred, average="weighted")*100)
print("Precision: ",precision_score(y_test, pred, average="weighted")*100)

Accuracy:  92.62343842950625
F1 Score:  91.18084545425656
Precision:  91.42122614970874


## Third Iteration

### Predicting labels using already trained model

In [38]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(unlabelled_data['text'])
data = tfidf_transformer.transform(unlabelled_data['text'])
results_prob_3 = bestmodel.predict_proba(data)
results_label_3 = bestmodel.predict(data)

In [39]:
# for i in range(len(results_prob_3)):
#     print(results_prob_3[i][0]," ",results_prob_3[i][1])

### Adding the selected data with a high degree of certainty ( threshold of 0.8 and 0.98 here ) to the labelled data and removing them from unlabelled data

In [40]:
ddict = {'text' : [] , 'label':[] }
a=[]
for i in range(len(results_prob_3)):
    if(results_prob_3[i][1] >= 0.8):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(1)
    elif(results_prob_3[i][0] >= 0.98):
        a.append(i)
        ddict['text'].append(unlabelled_data.iloc[i]['text'])
        ddict['label'].append(0)

In [41]:
to_append = pd.DataFrame(ddict)

upd_labelled_data_3 = upd_labelled_data_2
upd_labelled_data_3 = upd_labelled_data_3.append(to_append)
upd_labelled_data_3 = upd_labelled_data_3.reset_index(drop=True)

unlabelled_data=unlabelled_data.drop(unlabelled_data.index[a])
unlabelled_data = unlabelled_data.reset_index(drop=True)

### Size of updated labelled data and unlabelled data

In [43]:
print(len(upd_labelled_data_3))
print(len(unlabelled_data))

10981
15078


### After adding data to labelled data,  training the model again

In [44]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(upd_labelled_data_3['text'],upd_labelled_data_3['label'],test_size=0.2)

In [45]:
# tfidf_transformer = TfidfVectorizer(analyzer='char',max_features = 6000,ngram_range = (1,6))
tfidf_transformer = TfidfVectorizer(max_features = 6000)

tfidf_transformer.fit(upd_labelled_data_3['text'])
train_x = tfidf_transformer.transform(X_train)
test_x= tfidf_transformer.transform(X_test)

In [46]:
c = [0.1, 1, 10]
hyperparameters = {'C': c,'kernel': ['linear']}  
SVM = svm.SVC(probability=True)
res = GridSearchCV(SVM, hyperparameters)
bestmodel = res.fit(train_x,y_train)
pred = bestmodel.predict(test_x)

In [47]:
print("Accuracy: ",accuracy_score( y_test,pred)*100)
print("F1 Score: ",f1_score(y_test,pred, average="weighted")*100)
print("Precision: ",precision_score(y_test, pred, average="weighted")*100)

Accuracy:  93.94629039599454
F1 Score:  92.68173288941033
Precision:  93.19939540826478
