In [285]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
import json
import pandas as pd
import numpy as np
import random
import string
from string import punctuation
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.metrics import accuracy_score

In [286]:
filename = "dataset/web_science_dataset.jsonl"
json_data = []
with open(filename) as f:
    json_data = f.readlines()
json_data_list = []
for item in json_data:
    json_data_list.append(json.loads(item))
df_ = pd.DataFrame(json_data_list)

In [287]:
df = pd.read_csv('cleaned_data.csv', sep = ',')
df.head()
df.shape

(957, 7)

In [288]:
map_ = dict(zip(df_['questionId'].astype(int),df_['answer']))
df['answer'] = df['questionId'].map(map_)

map_cat = dict(zip(df_['questionId'].astype(int),df_['categoryId']))
df['categoryId'] = df['questionId'].map(map_cat)

df.head()

Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId,answer,categoryId
0,does water have a memory as claimed in homeopathy,https://skeptics.stackexchange.com/questions/2#27,no,1.666667,2.333333,1,2,No\n\nWater forms strong intermolecular forces...,3
1,does chamomile help you to relax,https://skeptics.stackexchange.com/questions/3...,na,2.0,2.0,1,3,There is a website by the NIH about Chamomile ...,0
2,are there benefits to the eca stack for bodybu...,https://skeptics.stackexchange.com/questions/2...,yes,1.666667,2.0,1,22,It appears that the combination of ephedrine a...,0
3,can positive thinking provide an improved outc...,https://skeptics.stackexchange.com/questions/2...,yes,2.666667,2.666667,1,26,To add to Krzysztof's answer. There were also ...,2
4,are vegetables good for me,https://skeptics.stackexchange.com/questions/3...,yes,2.0,2.333333,1,32,"\n From a young age, most people are told tha...",0


In [289]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return (' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(text)))

In [290]:
def preprocess_text(df):    

    # remove leading/trailing spaces
    df = df.str.strip()
    
    # convert to lowercase
    df = df.str.lower()
    
    df = df.replace(to_replace ='http\S+', value = '', regex = True)
    
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation) 
    df = df.str.translate(translator)
    
    # remove non-alphanumeric characters
    df = df.replace(to_replace ='\s*[^A-Za-z0-9]+\s*', value = ' ', regex = True)
    
    # remove digits
    translator = str.maketrans('', '', string.digits) 
    df = df.str.translate(translator)
    
    df = df.str.strip()
    
    #lemmatize
    #df = df.apply(lemmatize_text)
    
    return df

In [291]:
# convert to lower case, remove leading/trailing spaces
df['Question'] = df['Question'].astype(str).str.lower().str.strip()
df['Answer Label'] = df['Answer Label'].astype(str).str.lower().str.strip()

In [292]:
df['Question'] = preprocess_text(df['Question'])
df['answer'] = preprocess_text(df['answer'])
print(np.unique(df['Answer Label']))
df['Answer Label'] = df['Answer Label'].replace('nan','na')

answer_label_map = {'yes':2,'no':0,'na':1}
df['Answer Label'] = df['Answer Label'].map(answer_label_map)
#df['Answer Quality'] = ((df['Answer Quality']-1)/(3-1))

['na' 'no' 'yes']


In [293]:
for i, row in df.iterrows():
    text = remove_stopwords(row['answer'])
    #print(text)
    new_text = " ".join(text.split(" ")[:])
    #print(new_text)
    df.at[i,'answer'] = new_text

mean_answer_lengths = df['answer'].apply(lambda x: np.mean(len(x)))
mean_answer_lengths

0       799.0
1       344.0
2      1275.0
3      1210.0
4      1620.0
        ...  
952    1568.0
953    1316.0
954     260.0
955    2624.0
956     841.0
Name: answer, Length: 957, dtype: float64

In [294]:
def read_file(filename):
    with open(filename) as f:
        data = f.readlines()
        data = [int(i.strip()) for i in data ]
    return data

In [295]:
train_ids = read_file('quality_prediction_data/training_ids.txt')
test_ids = read_file('quality_prediction_data/testing_ids.txt')
train_data = df.loc[df['questionId'].isin(train_ids)]
test_data = df.loc[df['questionId'].isin(test_ids)]

In [296]:
from sklearn.model_selection import train_test_split

#train_data, valid_data = train_test_split(df,stratify=df['categoryId'].values, test_size=0.2)
#train_data.shape
#valid_data.shape
#train_data.head()

In [297]:
unique_labels = set(train_data['Answer Label'])
unique_labels
label_map = {v:k for k,v in enumerate(unique_labels)}
label_map

{0: 0, 1: 1, 2: 2}

In [298]:
corpus = train_data['answer']
labels = train_data['Answer Label']

In [299]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
Z = vectorizer.transform(test_data['answer']).toarray()

In [300]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [301]:
test_predictions = clf.predict(Z)
accuracy = accuracy_score(test_data['Answer Label'], test_predictions)

In [302]:
accuracy

0.38144329896907214

In [303]:
vectorizer = TfidfVectorizer(stop_words="english",max_features=None) #
response = vectorizer.fit_transform(corpus)
response = response.todense()
Z = vectorizer.transform(test_data['answer'])
Z = Z.todense()

In [304]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(response, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [305]:
test_predictions = clf.predict(Z)
accuracy = accuracy_score(test_data['Answer Label'], test_predictions)

In [306]:
accuracy

0.4072164948453608