In [0]:
#Importing packages here 
import os 
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re 
from nltk.metrics.distance import edit_distance
from sklearn.feature_extraction.text import TfidfVectorizer
from tpot import TPOTClassifier

In [0]:
#Mounting google drive 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#Changing the working directory to this
#This cell needs to be executed once only otherwise it will enter the except block
try:
  os.chdir(os.getcwd()+"/drive/My Drive/IC/IC Project")
except:
    print("Already in the specified folder")


In [0]:
df = pd.read_csv("Data.csv",lineterminator='\n',index_col=0)

In [0]:
#Seperating the x,y 
x = df['Tweet']
y = df['Sentiment']

In [0]:
#Setting fixed random seed for consistent results
np.random.seed(0)

In [0]:
#Now need to do pre-processing steps
#Step 1: Case folding more popularly known as lowercasing :) 
x = x.str.lower()

In [0]:
#Step 2: Need to remove stopwords
stop = []
with open('stop.txt','r') as f:
    for word in tqdm(f):
      stop.append(word.strip())
#print(stop)
for word in tqdm(stop):
  x = x.replace(to_replace=r'\b{}\b'.format(word),value="",regex=True)

x = x.str.split()
#print(x)
       

102it [00:00, 102153.54it/s]
100%|██████████| 102/102 [00:04<00:00, 23.51it/s]


In [0]:
x = x.drop([16904],axis=0) #this becomes nan so dropping it
y = y.drop([16904],axis=0) #also dropping this

In [0]:
#dropping the rows that become empty too after cleaning
drop = []
for word,i in zip(x,range(len(x))):
  if len(word)==0:  
      drop.append(i)
  

In [0]:
x = x.drop(drop,axis=0)
y = y.drop(drop,axis=0)

In [0]:
#Calculating edit distance first and removing words which have edit distance of 1 with the stop-words due to stop-words corpus being too small
distance = []
for word in tqdm(stop):
  dist = []
  for line in x:
    linelist = []
    for l in line[:]:
      d = edit_distance(word,l)
      if d==1:
        line.remove(l)
#Dropping the words which have length=0
result = x[x.apply(len)==0].index

y = y.drop(result,axis=0)
x = x.drop(result,axis=0)


100%|██████████| 102/102 [04:17<00:00,  2.53s/it]


In [0]:
#Step 4: Splitting into traning and testing data but first setting the random state to get consistent result everytime 
X_train,X_test,y_train,y_test = train_test_split(x,y)

In [0]:
#Step 5: Get count of words for the training data.
pseudoDict = {}
for values in X_train:
    for word in values:
        if word not in pseudoDict.keys():
            pseudoDict[word] = 1
        else:
            pseudoDict[word] = pseudoDict[word]+1

In [0]:
stopwords = []
threshold = 100
for val in pseudoDict:
    if pseudoDict[val]>threshold:
        stopwords.append(val)
print(stopwords)
#Removing these stopwords from the training corpus


['kabhi', '.', 'allah', 'time', 'muslim']


In [0]:
#Step 7: Words that are present too often are also stop words.Need to remove the words after seeing what words are those
for word in tqdm(stopwords):
  for linelist in X_train:
    for token in linelist[:]:
      if token in stopwords:
        linelist.remove(token)

result = X_train[X_train.apply(len)==0].index
y_train = y_train.drop(result,axis=0)
X_train = X_train.drop(result,axis=0)
X_train = X_train.str.join(" ")
X_test = X_test.str.join(" ")

100%|██████████| 5/5 [00:00<00:00, 34.46it/s]


In [0]:
X_test

1891                    shobey qadam aehal ba’kamaal sabit
1629                                              shukria.
2545         shuru salon gaeyki tasawwuf peyghamat bharpor
2593                                 bachpan singing shauq
11922    ithad intaykhabat numayan aksaryat kamyabi nov...
                               ...                        
8743                               assingment mukmal enjoy
7230     apni company film arts bainer taley 1966 wahee...
12889             physical appearance comment guraiz kren.
714      .zulfiqar bhutto khahis unhain wazir kharja mo...
16964                         means rozay buht acahy guzar
Name: Tweet, Length: 4962, dtype: object

In [0]:
X_train

3596                                                beshak
5767     edhi free cancer hospital tariq road cancer 91...
8670                       hahhhh ufffff... mere ajaoo :-d
2270     unhon sarehy panch saw zaid moqablon mosalsel ...
4224                                         phans jaao 😁😁
                               ...                        
11740    bani hotel jahan tehri wahan airport 15 minute...
4260              khalaq bezata hahahahahaha osho apricots
13113                             itna bura read more hide
10887    news parties resignation beghair wapis jayengi...
6783                                                unziii
Name: Tweet, Length: 14693, dtype: object

In [0]:
#Step 8: Create a tf-idf representation of the training corpus 
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train) 
X_test_vec = vectorizer.transform(X_test)

In [0]:
#Need to do this to complete IC Project deliverable. Use tpot bish
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, random_state=0,warm_start=True,periodic_checkpoint_folder="BestPipelines",config_dict="TPOT sparse") 


In [0]:
tpot.fit(X_train_vec, y_train)

13 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #16 due to time out. Continuing to the next pipeline.
Skipped pipeline #21 due to time out. Continuing to the next pipeline.
Created new folder to save periodic pipeline: BestPipelines
Saving periodic pipeline from pareto front to BestPipelines/pipeline_gen_1_idx_0_2020.03.27_19-41-32.py
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True.
_pre_test decorator: _random_mutation_operator: num_test=1 A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array..
_pre_test decorator: _random_mutation_operator: num_test=0 A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array..
_pre_test decorator: _random_mutation_operator: num

TPOTClassifier(config_dict='TPOT sparse', crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder='BestPipelines', population_size=20,
               random_state=0, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=3, warm_start=True)

In [0]:
tpot.evaluated_individuals_

{'BernoulliNB(CombineDFs(input_matrix, CombineDFs(input_matrix, input_matrix)), BernoulliNB__alpha=0.1, BernoulliNB__fit_prior=True)': {'crossover_count': 0,
  'generation': 'INVALID',
  'internal_cv_score': 0.6038245319916589,
  'mutation_count': 3,
  'operator_count': 1,
  'predecessor': ('LinearSVC(CombineDFs(input_matrix, CombineDFs(input_matrix, input_matrix)), LinearSVC__C=20.0, LinearSVC__dual=False, LinearSVC__loss=squared_hinge, LinearSVC__penalty=l1, LinearSVC__tol=0.0001)',)},
 'BernoulliNB(CombineDFs(input_matrix, RFE(input_matrix, RFE__ExtraTreesClassifier__criterion=entropy, RFE__ExtraTreesClassifier__max_features=0.3, RFE__ExtraTreesClassifier__n_estimators=100, RFE__step=0.9000000000000001)), BernoulliNB__alpha=0.01, BernoulliNB__fit_prior=True)': {'crossover_count': 0,
  'generation': 'INVALID',
  'internal_cv_score': -inf,
  'mutation_count': 4,
  'operator_count': 2,
  'predecessor': ('BernoulliNB(CombineDFs(input_matrix, input_matrix), BernoulliNB__alpha=0.01, Berno

In [0]:
d = tpot.evaluated_individuals_ #All models evaluated

In [0]:
d.keys() #d is a dict

dict_keys(['LinearSVC(input_matrix, LinearSVC__C=0.0001, LinearSVC__dual=False, LinearSVC__loss=squared_hinge, LinearSVC__penalty=l2, LinearSVC__tol=0.0001)', 'BernoulliNB(SelectPercentile(input_matrix, SelectPercentile__percentile=88), BernoulliNB__alpha=0.001, BernoulliNB__fit_prior=True)', 'BernoulliNB(input_matrix, BernoulliNB__alpha=0.01, BernoulliNB__fit_prior=True)', 'MultinomialNB(input_matrix, MultinomialNB__alpha=100.0, MultinomialNB__fit_prior=True)', 'BernoulliNB(input_matrix, BernoulliNB__alpha=0.1, BernoulliNB__fit_prior=False)', 'MultinomialNB(input_matrix, MultinomialNB__alpha=0.1, MultinomialNB__fit_prior=False)', 'XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.5, XGBClassifier__max_depth=4, XGBClassifier__min_child_weight=19, XGBClassifier__n_estimators=100, XGBClassifier__nthread=1, XGBClassifier__subsample=0.7500000000000001)', 'KNeighborsClassifier(RFE(input_matrix, RFE__ExtraTreesClassifier__criterion=gini, RFE__ExtraTreesClassifier__max_features=0.750

In [0]:
models_df = pd.DataFrame(data=d) #Creating a dataframe and saved it in modelInfo.csv
#models_df.to_csv("modelInfo.csv")

In [0]:
#Step 9: Do SVD on the tf-idf representation to reduce dimensionality


In [0]:
#Step 10: Do ridge regression for feature selection to reduce dimensionality

In [0]:
#Step 11: Do lasso regression for feature selection to reduce dimensionality 

In [0]:
#Step 12: Create a multi-input feed forward neural network using features obtained from step 9,10,11 and train it 

In [0]:
#Step 13: Do hyperparameter tunning using talos 

In [0]:
#Step 14: Do 