# Importing Packages

In [20]:
import pandas as pd

# Reading the data files

In [21]:
# Load the question and answer CSV files; both are decoded as latin-1.
questions_df = pd.read_csv(
    "../data/Questions.csv",
    encoding='latin-1',
)
answers_df = pd.read_csv(
    "../data/Answers.csv",
    encoding='latin-1',
)

# Dropping all the unanswered questions

In [22]:
# Boolean mask: True for every question whose Id is referenced as a ParentId
# by at least one row of the answers table.
mask = questions_df['Id'].isin(answers_df['ParentId'])

# Keep only the answered questions. The explicit .copy() makes the result an
# independent frame, so assigning to its columns in later cells does not
# trigger pandas' SettingWithCopyWarning.
questions_df = questions_df[mask].copy()
questions_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [23]:
# Sanity check: the number of distinct question Ids should equal the number
# of distinct ParentIds referenced by the answers. Compute the comparison
# once and report it.
same_unique_ids = questions_df.Id.nunique() == answers_df.ParentId.nunique()
print("Is the number of unique Ids in both dataframes the same: ", same_unique_ids)

Is the number of unique Ids in both dataframes the same:  True


#### We have now removed all the unanswered questions from the dataset: the number of unique question Ids equals the number of unique ParentIds in the answers, so every remaining question has at least one answer.

# Cleaning the HTML tags from the bodies of the questions and answers

In [24]:
# Remove every HTML tag from the body of the questions.
# The pattern '<.*?>' is a regular expression (non-greedy match from '<' to the
# next '>'), so regex=True must be passed explicitly: without it, newer pandas
# versions treat the pattern as a literal string and strip nothing.
questions_df['Body'] = questions_df['Body'].str.replace(r'<.*?>', '', regex=True)
questions_df.head()

  questions_df['Body'] = questions_df['Body'].str.replace(r'<.*?>', '')


Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...


In [25]:
# Remove every HTML tag from the body of the answers.
# The pattern '<.*?>' is a regular expression (non-greedy match from '<' to the
# next '>'), so regex=True must be passed explicitly: without it, newer pandas
# versions treat the pattern as a literal string and strip nothing.
# NOTE(review): only tags are removed; HTML entities such as '&gt;' are left
# as-is in the text.
answers_df['Body'] = answers_df['Body'].str.replace(r'<.*?>', '', regex=True)
answers_df.head()

  answers_df['Body'] = answers_df['Body'].str.replace(r'<.*?>', '')


Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,open up a terminal (Applications-&gt;Utilities...
1,518,153.0,2008-08-02T17:42:28Z,469,2,I haven't been able to find anything that does...
2,536,161.0,2008-08-02T18:49:07Z,502,9,You can use ImageMagick's convert utility for ...
3,538,156.0,2008-08-02T18:56:56Z,535,23,One possibility is Hudson. It's written in Ja...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"We run Buildbot - Trac at work, I haven't used..."


#### The HTML tags have been removed from the dataset.

# Let's now drop all the unnecessary columns from the dataset

In [26]:
# Discard the metadata columns of the questions; the Id, Title and Body
# columns are left in place.
questions_df = questions_df.drop(
    columns=['OwnerUserId', 'CreationDate', 'Score'],
)
questions_df.head()

Unnamed: 0,Id,Title,Body
0,469,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...
1,502,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...
2,535,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...
3,594,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...
4,683,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...


In [27]:
# Discard the metadata columns of the answers; the Id, ParentId and Body
# columns are left in place.
answers_df = answers_df.drop(
    columns=['OwnerUserId', 'CreationDate', 'Score'],
)
answers_df.head()

Unnamed: 0,Id,ParentId,Body
0,497,469,open up a terminal (Applications-&gt;Utilities...
1,518,469,I haven't been able to find anything that does...
2,536,502,You can use ImageMagick's convert utility for ...
3,538,535,One possibility is Hudson. It's written in Ja...
4,541,535,"We run Buildbot - Trac at work, I haven't used..."


In [28]:
# Write the cleaned questions to disk; index=False leaves the row index out
# of the CSV file.
questions_df.to_csv(
    "../data/Questions_cleaned.csv",
    index=False,
)