## 1) Import Libraries

In [33]:
import os
import numpy as np
import pandas as pd
from profanity import profanity 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

## 2) Import Data Set

In [34]:
df = pd.read_csv('data_set2.csv')

In [35]:
df.shape

(800, 20)

In [36]:
df.head()

Unnamed: 0,Id,Title,Text,Comment,Tags,PostTypeId,LastEditDate,Title.1,Tags.1,Reputation,CreationDate,Views,UpVotes,DownVotes,WebsiteUrl,Location,AboutMe,DisplayName,ApprovalDate,RejectionDate
0,666146,,How can I display an image or text whenever I ...,"Grammar, condensed text",,1,4/8/13 13:39,CSS tricky hover effect,<javascript><css>,8374,8/17/11 17:20,2662,2491,855,http://chrisforrence.com,"Atlanta, GA, United States",<h2>Howdy!</h2>\n\n<p>I'm a software engineer ...,Chris Forrence,4/8/13 13:39,
1,2752683,"""internal_metadta error"" when using Blockchain...",We are using Blockchain as a service on a Blue...,I made some minor edits to grammar and spelling.,,1,5/23/17 10:27,"""internal_metadta error"" when using Blockchain...",<ibm-cloud><blockchain><hyperledger>,1709,1/15/14 15:53,533,52,19,,"Austin, TX",<p>I am a Knowledge Manager for IBM Cloud plat...,William 'Bill' Wentworth,10/3/16 21:40,
2,675850,,I working on a `GIS` application which uses `P...,Formatting some texts,,1,4/15/13 12:05,How to set InsertCommand parameter as function...,<c#><postgresql><ado.net><postgis><dataadapter>,748,5/16/12 10:48,233,377,55,,,,Futuregeek,4/15/13 9:14,
3,2675420,,I have a war file with the below structure.\r\...,added missing characters,,1,8/18/16 13:06,Issue while reading a file from WAR file,<java>,1211,8/15/12 19:52,110,27,4,,,,Ömer Erden,8/18/16 13:06,
4,817177,,My code has generated the search string `veri_...,Fixed formatting and broken English,,1,7/22/13 20:14,How to break this while loop in apache poi get...,<java><while-loop><apache-poi>,3292,6/18/12 17:57,1012,5738,248,,4444,"<p><a href=""http://stackoverflow.com/users/146...",4444,7/22/13 18:31,


## 3)Data Preparation

### - Cleaning some features:

In [37]:
##Comments' Features:

#There are two types of posts that can be edited
# I use 0 for Editing a question
# I use 1 for editing an answer
df.loc[df['PostTypeId'] == 1, 'PostTypeId'] = 0
df.loc[df['PostTypeId'] == 2, 'PostTypeId'] = 1

#Checks if the post was edited before
df['LastEditDate']=df['LastEditDate'].fillna(0)
df.loc[df['LastEditDate'] != 0, 'LastEditDate'] = 1

#Comments Length
df['CommentLength'] = df['Comment'].apply(len)

#Check if the title of the post was edited
df["TitleChange1"] = df['Title'].fillna('False')
df.loc[df['TitleChange1'] != 'False', 'TitleChange1'] = 'True'

df.loc[df['Title'] == df['Title.1'], "TitleChange2"] = 'True'
df.loc[df['Title'] != df['Title.1'], "TitleChange2"] = 'False'

df.loc[df['TitleChange1'] == 'False', "TitleChange1"] = 0
df.loc[df['TitleChange1'] == 'True', "TitleChange1"] = 1

df.loc[df['TitleChange2'] == 'False', "TitleChange2"] = 0
df.loc[df['TitleChange2'] == 'True', "TitleChange2"] = 1

df['TitleChange'] = df['TitleChange1']^df['TitleChange2']

# check for profanity in the comments and the editions
df['CommentProfanity'] = df['Comment'].apply(lambda x: profanity.contains_profanity(x))
df['Text']=df['Text'].fillna('0')
df['TextProfanity'] = df['Text'].apply(lambda x: profanity.contains_profanity(x))

In [38]:
#Check if the tags of the post were edited
df['Tags.1'] = df['Tags.1'].map(lambda x: str(x).lstrip('<').rstrip('>'))
df['Tags.1'] = df['Tags.1'].str.replace('><', ' ')

#Check if the title of the post was edited
df["TagChange1"] = df['Tags'].fillna('False')
df.loc[df["TagChange1"] != 'False', "TagChange1"] = 'True'

df.loc[df['Tags'] == df['Tags.1'], "TagChange2"] = 'True'
df.loc[df['Tags'] != df['Tags.1'], "TagChange2"] = 'False'

df.loc[df["TagChange1"] == 'False', "TagChange1"] = 0
df.loc[df["TagChange1"] == 'True', "TagChange1"] = 1

df.loc[df["TagChange2"] == 'False', "TagChange2"] = 0
df.loc[df["TagChange2"] == 'True', "TagChange2"] = 1

df['TagChange'] = df["TagChange1"]^df["TagChange2"]

In [39]:
##User's Features

#The user has a WebstieURL
df['WebsiteUrl']=df['WebsiteUrl'].fillna(0)
df.loc[df['WebsiteUrl'] != 0, 'WebsiteUrl'] = 1

#The user stated a Location
df['WebsiteUrl']=df['WebsiteUrl'].fillna(0)
df.loc[df['Location'] != 0, 'Location'] = 1

#the user wrote an AboutMe
df['AboutMe']=df['AboutMe'].fillna(0)
df.loc[df['AboutMe'] != 0, 'AboutMe'] = 1

In [40]:
## Output
#output 0 notApprove, 1 approve
df['Y'] = df['ApprovalDate'].fillna(0)
df.loc[df['Y'] != 0, 'Y'] = 1

### - Organizing the data in a new dataframe

In [41]:
data = pd.DataFrame()

In [42]:
##Comments' Features:
#Qestion 0, answer 1
data['PostType']= df['PostTypeId']
# Not Edited before 0, Edited before 1
data['Edited']= df['LastEditDate']
#length of comment
data['LenComment']= np.log(df['CommentLength']+1)
#Title Change
data['TitleChange'] = 'Nan'
data.loc[df['TitleChange'] == True, 'TitleChange'] = 1
data.loc[df['TitleChange'] == False, 'TitleChange'] = 0
#Tag Change
data['TagChange'] = 'Nan'
data.loc[df['TagChange'] == True, 'TagChange'] = 1
data.loc[df['TagChange'] == False, 'TagChange'] = 0
#CommentProfanity
data['ComProf'] = 'Nan'
data.loc[df['CommentProfanity'] == True, 'ComProf'] = 1
data.loc[df['CommentProfanity'] == False, 'ComProf'] = 0
#TextProfanity
data['TxtProf'] = 'Nan'
data.loc[df['TextProfanity'] == True, 'TxtProf'] = 1
data.loc[df['TextProfanity'] == False, 'TxtProf'] = 0

In [43]:
##User's Features

#Total Reputation
data['Reputation']= np.log(df['Reputation']+1)
#totalUpvotes
data['UpVotes']= np.log(df['UpVotes']+1)
#totalDownVotes
data['DownVotes']= np.log(df['DownVotes']+1)
#Completion of profile 0 nothing 3 all complete
data['ProfileCompletion'] = df['Location'] + df['AboutMe'] + df['WebsiteUrl']

In [44]:
#output
data['Output'] = df['Y']

In [45]:
data.shape

(800, 12)

In [46]:
data.head()

Unnamed: 0,PostType,Edited,LenComment,TitleChange,TagChange,ComProf,TxtProf,Reputation,UpVotes,DownVotes,ProfileCompletion,Output
0,0,1,3.178054,0,0,0,0,9.033006,7.820841,6.75227,3,1
1,0,1,3.89182,0,0,0,0,7.444249,3.970292,2.995732,2,1
2,0,1,3.091042,0,0,0,0,6.618739,5.934894,4.025352,1,1
3,0,1,3.218876,0,0,0,0,7.100027,3.332205,1.609438,1,1
4,0,1,3.583519,0,0,0,0,8.099554,8.65504,5.517453,2,1


## 4)  Make a test/train split of the data

In [47]:
X = data.drop('Output',axis=1)
Y = data['Output']

In [48]:
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.30, random_state=seed)
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

## 5) Normalise data

In [49]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 6) Testing ML Algorithm

### Classification Reports

In [50]:
from sklearn.metrics import classification_report

#### a) Logistic Regression:

In [28]:
logistic = LogisticRegression()
logistic.fit(X_train, Y_train)
prediction = logistic.predict(X_test)
print(classification_report(Y_test, prediction ))

             precision    recall  f1-score   support

          0       0.82      0.61      0.70       129
          1       0.65      0.85      0.74       111

avg / total       0.74      0.72      0.72       240



#### b) Support Vector Machine:

In [29]:
svm = SVC()
svm.fit(X_train, Y_train)
prediction = svm.predict(X_test)
print(classification_report(Y_test, prediction ))

             precision    recall  f1-score   support

          0       0.87      0.58      0.70       129
          1       0.65      0.90      0.75       111

avg / total       0.77      0.73      0.72       240



#### c) Multi-layer Perceptron:

In [32]:
mlp = MLPClassifier(alpha=10,hidden_layer_sizes=(40,40,40,40),max_iter=1000)
mlp.fit(X_train, Y_train)
prediction = mlp.predict(X_test)
print(classification_report(Y_test, prediction ))

             precision    recall  f1-score   support

          0       0.86      0.61      0.71       129
          1       0.66      0.88      0.76       111

avg / total       0.77      0.74      0.73       240

