In [86]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Data Cleaning & EDA

In [31]:
# Load reddit_others_full.csv (path is relative to the working directory) into `others`.
others = pd.read_csv(filepath_or_buffer='reddit_others_full.csv')

In [32]:
# Preview the first five rows of the freshly loaded frame.
others.head(n=5)

Unnamed: 0,title,top_comment,answers
0,"Found this online, is it true?",- If you are interested in the live action for...,\n\nWithout knowing the source of the informat...
1,"One Piece Summarized, took 30 hours. Can you f...",I didn't think anyone could summarize the enti...,\n\nPandaman can be found in the One Piece man...
2,"You saved some of the treasure, right?",Zoro making a run for it and taking Sanji with...,"\n\nYes, I did save some of the treasure."
3,Where’s Pandaman? Took 71 hours. Can you find ...,"I love all the details put on this, like Zoro ...",\n\n1. In the upper right corner of the image....
4,Is it real ?,That's naruto's last chapter. Oda paying respe...,\n\nThat depends on what you are asking about.


In [33]:
# Label the rows 1..N instead of the default 0..N-1.
# Reference: https://www.includehelp.com/python/pandas-start-row-index-from-1-instead-of-zero-without-creating-additional-column.aspx
others.index = np.arange(len(others)) + 1

In [35]:
# Give the index the name 'row_id' so it shows up as a labelled column header.
# Reference: https://saturncloud.io/blog/how-to-name-your-pandas-index-column-a-guide-for-data-scientists/#:~:text=Method%201%3A%20Using%20the%20rename_axis()%20Method&text=In%20this%20example%2C%20we%20first,index%20column%20to%20'person'.
others.index = others.index.rename('row_id')

In [36]:
# Confirm the index now starts at 1 and is named 'row_id'.
others.head(n=5)

Unnamed: 0_level_0,title,top_comment,answers
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Found this online, is it true?",- If you are interested in the live action for...,\n\nWithout knowing the source of the informat...
2,"One Piece Summarized, took 30 hours. Can you f...",I didn't think anyone could summarize the enti...,\n\nPandaman can be found in the One Piece man...
3,"You saved some of the treasure, right?",Zoro making a run for it and taking Sanji with...,"\n\nYes, I did save some of the treasure."
4,Where’s Pandaman? Took 71 hours. Can you find ...,"I love all the details put on this, like Zoro ...",\n\n1. In the upper right corner of the image....
5,Is it real ?,That's naruto's last chapter. Oda paying respe...,\n\nThat depends on what you are asking about.


In [39]:
# Print the complete 'answers' text for row_id 1; head() only shows a truncated preview.
print(others.loc[1, 'answers'])



Without knowing the source of the information, it is impossible to know if it is true or not. It is always best to research any information you find online to make sure it is accurate and reliable.


In [42]:
# Trim leading/trailing whitespace from every answer (the previews show them starting with '\n\n').
# Reference: https://www.geeksforgeeks.org/pandas-strip-whitespace-from-entire-dataframe/
others['answers'] = others.answers.str.strip()
# Re-print row_id 1 to check that the leading blank lines are gone.
print(others.loc[1, 'answers'])

Without knowing the source of the information, it is impossible to know if it is true or not. It is always best to research any information you find online to make sure it is accurate and reliable.


In [43]:
# Preview the frame again after stripping whitespace from 'answers'.
others.head(n=5)

Unnamed: 0_level_0,title,top_comment,answers
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Found this online, is it true?",- If you are interested in the live action for...,"Without knowing the source of the information,..."
2,"One Piece Summarized, took 30 hours. Can you f...",I didn't think anyone could summarize the enti...,Pandaman can be found in the One Piece manga a...
3,"You saved some of the treasure, right?",Zoro making a run for it and taking Sanji with...,"Yes, I did save some of the treasure."
4,Where’s Pandaman? Took 71 hours. Can you find ...,"I love all the details put on this, like Zoro ...",1. In the upper right corner of the image.\n2....
5,Is it real ?,That's naruto's last chapter. Oda paying respe...,That depends on what you are asking about.


In [None]:
#TODO: remove rows whose comment contains [deleted]
#TODO: remove rows whose comment contains '# Message to all users...'
#TODO: remove rows whose comment contains [removed]
#TODO: remove rows whose comment starts with 'Welcome'
#TODO: remove rows containing NaN

#TODO: handle italics, bold, and links in the text

#TODO: NLTK preprocessing
    #Lemmatizer
    #StopWords


Next, build a single dataframe with one column that holds every answer (from both humans and the AI) and a second column with a binary value indicating whether each answer came from a human or the AI.

In [47]:
# Human-written side of the dataset: each row's top comment becomes its 'answer'.
df1 = pd.DataFrame({'answer': others['top_comment']})
df1.head()

Unnamed: 0_level_0,answer
row_id,Unnamed: 1_level_1
1,- If you are interested in the live action for...
2,I didn't think anyone could summarize the enti...
3,Zoro making a run for it and taking Sanji with...
4,"I love all the details put on this, like Zoro ..."
5,That's naruto's last chapter. Oda paying respe...


In [48]:
# Label column 'who_from': 0 marks a human response, 1 marks an AI response.
# Every row in df1 is a human comment, so the whole column is 0.
df1 = df1.assign(who_from=0)
df1.head()

Unnamed: 0_level_0,answer,who_from
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,- If you are interested in the live action for...,0
2,I didn't think anyone could summarize the enti...,0
3,Zoro making a run for it and taking Sanji with...,0
4,"I love all the details put on this, like Zoro ...",0
5,That's naruto's last chapter. Oda paying respe...,0


In [49]:
# AI side of the dataset: the 'answers' column becomes 'answer', labelled who_from = 1.
df2 = pd.DataFrame({'answer': others['answers']}).assign(who_from=1)
df2.head()

Unnamed: 0_level_0,answer,who_from
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Without knowing the source of the information,...",1
2,Pandaman can be found in the One Piece manga a...,1
3,"Yes, I did save some of the treasure.",1
4,1. In the upper right corner of the image.\n2....,1
5,That depends on what you are asking about.,1


In [60]:
# Stack the human rows (df1) on top of the AI rows (df2); row_id labels repeat across the two halves.
df = pd.concat(objs=[df1, df2], axis=0)
df.head(500)

Unnamed: 0_level_0,answer,who_from
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,- If you are interested in the live action for...,0
2,I didn't think anyone could summarize the enti...,0
3,Zoro making a run for it and taking Sanji with...,0
4,"I love all the details put on this, like Zoro ...",0
5,That's naruto's last chapter. Oda paying respe...,0
...,...,...
110,Kawhi Leonard is the star of the Toronto Rapto...,1
111,?\n\nYes! Let's give a big shout out to our gu...,1
112,"No, we should not be that fanbase. We need to ...",1
113,"Yes, this is the dagger. It is a powerful arti...",1


In [61]:
# Replace the duplicated row_id index with a fresh 0..N-1 index, keeping row_id as a column.
# Reference: https://pynative.com/pandas-reset-index/#:~:text=Use%20DataFrame.reset_index()%20function,-We%20can%20use&text=reset_index()%20to%20reset%20the,of%20numbers%20starting%20at%200.
# Rebuild from df1/df2 instead of reassigning df from itself: `df = df.reset_index()`
# is not idempotent, so re-running the cell would add a spurious 'index' column.
df = pd.concat([df1, df2]).reset_index()
df.head(200)

Unnamed: 0,row_id,answer,who_from
0,1,- If you are interested in the live action for...,0
1,2,I didn't think anyone could summarize the enti...,0
2,3,Zoro making a run for it and taking Sanji with...,0
3,4,"I love all the details put on this, like Zoro ...",0
4,5,That's naruto's last chapter. Oda paying respe...,0
...,...,...,...
195,82,,1
196,83,Isaiah Thomas is now an incredibly impressive ...,1
197,84,Giannis is obviously sad to see his friend go ...,1
198,85,,1


In [77]:
# Discard every row that has a NaN in any column (the preview above shows missing answers).
df = df.dropna(axis=0, how='any')
df.head(200)

Unnamed: 0,row_id,answer,who_from
0,1,- If you are interested in the live action for...,0
1,2,I didn't think anyone could summarize the enti...,0
2,3,Zoro making a run for it and taking Sanji with...,0
3,4,"I love all the details put on this, like Zoro ...",0
4,5,That's naruto's last chapter. Oda paying respe...,0
...,...,...,...
205,92,"No, Ben Simmons does not resemble Mona Lisa.",1
206,93,"Butler: ""No, not at all. We all worked hard to...",1
208,95,It's difficult to say why someone may not want...,1
209,96,"I'm sorry, I didn't quite catch that. Could yo...",1


In [78]:
# Features are the answer text; the target is the human (0) / AI (1) label.
X, y = df['answer'], df['who_from']

In [79]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [80]:
# Learn the vocabulary and IDF weights from the training text only, then apply
# the same fitted vectorizer to both splits so the test set never influences it.
tfidf = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [81]:
# Fit a logistic regression (default settings) on the TF-IDF training matrix.
lr = LogisticRegression().fit(X_train_tfidf, y_train)
lr

In [82]:
# Predict the human/AI label for each held-out answer.
y_pred = lr.predict(X=X_test_tfidf)

In [88]:
# Fraction of held-out answers whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8636363636363636

In [89]:
# Share of each true label in the test split, for comparison with the accuracy above.
y_test.value_counts(normalize=True)

1    0.545455
0    0.454545
Name: who_from, dtype: float64

In [92]:
# Share of each predicted label, to compare against the true label shares.
pd.Series(y_pred).value_counts(normalize=True)

0    0.545455
1    0.454545
dtype: float64