In [None]:
import numpy as np
import pandas as pd
import os
import nltk
from gensim.models import Word2Vec
import re
import multiprocessing

In [None]:
# Load the posts that the tokenising / vectorising steps below are run on.
df = pd.read_csv("post.csv")

In [None]:
# Module-level NLTK helpers read by post_tokenizing: the English stop-word
# list and a word/punctuation tokenizer.
stop_words = nltk.corpus.stopwords.words("english")
wpt = nltk.tokenize.WordPunctTokenizer()

In [None]:
def post_tokenizing(df):
    """Clean raw posts and return them as space-joined token strings with labels.

    Rows of ``df`` are read positionally: column 0 is the post text and
    column 1 is its label. Each post has ``Q:`` markers, ``&#039;<letter>``
    fragments and simple lowercase ``<tag>`` markup removed, is lower-cased,
    stripped of every character that is not an ASCII letter or whitespace,
    and tokenized with the module-level ``wpt`` tokenizer. Tokens found in
    the module-level ``stop_words`` or shorter than three characters are
    discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds post text and second column the label.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Post'`` (tokens joined by a single space) and ``'class'``
        (the label copied from column 1), indexed 0..n-1. Posts that are not
        strings, or that have no tokens left after filtering, are omitted.
    """
    # Set membership is O(1); the module-level stop-word container is a list.
    stop_set = set(stop_words)
    records = []
    for val in df.values:
        text = val[0]
        if not isinstance(text, str):
            # e.g. NaN from an empty CSV cell: there is nothing to tokenize.
            continue
        text = re.sub(r'Q:', '', text)
        # Drop the HTML apostrophe entity plus the single lowercase letter
        # that follows it (as in "&#039;s").
        text = re.sub(r'&#039;[a-z]{1}', '', text)
        text = re.sub('<[a-z]+>', ' ', text).lower()
        # flags must be passed by keyword: the fourth positional argument of
        # re.sub is `count`, so passing flags there silently caps the number
        # of replacements instead of applying the flags.
        text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
        tokens = [
            token for token in wpt.tokenize(text)
            if token not in stop_set and len(token) >= 3
        ]
        if tokens:
            records.append((' '.join(tokens), val[1]))
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=['Post', 'class'])

In [None]:
def _train_word2vec(token_list):
    """Fit the Word2Vec model used for document vectors on tokenized posts."""
    import inspect

    params = dict(
        min_count=20,
        window=2,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        # cpu_count() - 1 is 0 on a single-core machine; keep at least one
        # training worker.
        workers=max(1, multiprocessing.cpu_count() - 1),
    )
    # gensim 4 renamed the dimensionality argument from `size` to
    # `vector_size`; use whichever name the installed version accepts.
    accepted = inspect.signature(Word2Vec.__init__).parameters
    params['vector_size' if 'vector_size' in accepted else 'size'] = 50
    return Word2Vec(token_list, **params)


def post_vector_calculation(token_df, embeddings=None):
    """Turn tokenized posts into one averaged word-vector row per post.

    Parameters
    ----------
    token_df : pandas.DataFrame
        Frame whose first column holds space-separated tokens and which has a
        ``'class'`` column with the label of each post.
    embeddings : optional
        An already fitted model exposing ``.wv`` (supporting ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``). When omitted, a new Word2Vec
        model is trained on the posts in ``token_df``. Pass the model fitted
        on the training posts to embed other data in the same vector space.

    Returns
    -------
    pandas.DataFrame
        One row per post, in the same order as ``token_df``: the mean of the
        vectors of the post's in-vocabulary words in integer-named columns,
        followed by a ``'class'`` column. A post with no in-vocabulary word
        gets a row of NaN.
    """
    token_list = [post.split(' ') for post in token_df.iloc[:, 0]]
    if embeddings is None:
        embeddings = _train_word2vec(token_list)

    # Go through .wv: indexing the model itself and .wv.vocab are gone in
    # gensim 4, while these lookups work on gensim 3 and 4 alike.
    keyed_vectors = embeddings.wv
    dim = keyed_vectors.vector_size

    rows = []
    for doc in token_list:
        known = [keyed_vectors[word] for word in doc if word in keyed_vectors]
        rows.append(np.mean(known, axis=0) if known else np.full(dim, np.nan))

    # Assemble the frame once rather than appending row by row.
    matrix = np.vstack(rows) if rows else np.empty((0, dim))
    docs_vectors = pd.DataFrame(matrix)
    # Attach labels by position so a non-default index on token_df cannot
    # misalign them with the vector rows.
    docs_vectors['class'] = token_df['class'].values
    return docs_vectors

In [None]:
# Tokenize the loaded posts, show the resulting (rows, columns), then build
# one vector row per post and preview the first five.
token_df = post_tokenizing(df)
print(token_df.shape)
docs_vectors = post_vector_calculation(token_df)
docs_vectors.head()

In [None]:
# Discard every row that contains a NaN (presumably posts for which no word
# vector could be averaged — verify against post_vector_calculation).
docs_vectors = docs_vectors.dropna(axis=0, how="any")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    docs_vectors.drop(columns='class'),
    docs_vectors['class'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature-matrix shapes of the training and held-out splits.
print(f"{X_train.shape} {X_test.shape}")

In [None]:
# Label-vector shapes of the training and held-out splits.
print(y_train.shape, y_test.shape)

In [None]:
# Cast both label vectors to integers before they are given to the classifiers.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [None]:
from sklearn.metrics import accuracy_score

# AdaBoost with 800 estimators: fit on the training split, then report
# accuracy on the held-out split as the cell output.
model = AdaBoostClassifier(n_estimators=800, random_state=42).fit(X_train, y_train)
test_pred = model.predict(X_test)
accuracy_score(y_test, test_pred)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Single-step pipeline; the 'svm__' prefix in the grid keys targets that step.
pipe = Pipeline(steps=[('svm', svm.SVC(probability=True))])
param_grid = {
    'svm__C': [0.1],
    'svm__gamma': [1],
    'svm__kernel': ['rbf']}
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where supplying it raises TypeError. The default behaviour
# since 0.22 is what iid=False used to select.
search = GridSearchCV(pipe, param_grid, cv=2, refit=True)
search.fit(X_train, y_train)
print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", search.best_params_)

# Keep the winning configuration and the pipeline refitted on all of X_train.
SVM_best_params = search.best_params_
SVM_best_model = search.best_estimator_
print(SVM_best_model)
print(SVM_best_params)

In [None]:
# Accuracy of the refitted grid-search SVM on the held-out split.
test_pred_svm = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_pred_svm)

In [None]:
# Load the second dataset and run it through the same tokenising and
# vectorising functions as the first one.
df_test = pd.read_csv("cleanprojectdataset.csv")
token_test_df = post_tokenizing(df_test)
# NOTE(review): called like this, post_vector_calculation trains a new
# Word2Vec model on the posts it is given, so these vectors come from a model
# fitted on this dataset rather than the one behind X_train — confirm that is
# intended before comparing scores.
docs_vectors_test = post_vector_calculation(token_test_df)

In [None]:
# Inspect the second dataset: first five vector rows, then the label column's
# shape and contents.
print(docs_vectors_test.head())
print(token_test_df['class'].shape)
print(token_test_df['class'])

In [None]:
# Score the grid-search SVM on the second dataset. NaN feature values are
# replaced with 0 instead of dropping the affected rows.
test_data = np.nan_to_num(docs_vectors_test.drop(columns='class').to_numpy())
test_pred_test = search.predict(test_data)
df_test_mod = token_test_df['class'].astype('int')
accuracy_score(df_test_mod, test_pred_test)

In [None]:
# Score the AdaBoost model on the same second-dataset feature matrix.
test_pred_test = model.predict(test_data)
# Take the labels from token_test_df rather than the raw df_test:
# post_tokenizing omits posts that end up with no tokens, and test_data is
# built from its output, so token_test_df['class'] has exactly one label per
# row of test_data without dropping a hard-coded row position.
df_test_mod = token_test_df['class'].astype('int')
accuracy_score(df_test_mod, test_pred_test)