# Cleaning the Data / Organizing It into a DataFrame

#### Importing necessary libraries

In [2]:
import pandas as pd
import json

import numpy as np
import time
np.random.seed(42)


import matplotlib.pyplot as plt

### Importing the raw JSON files into this notebook in order to extract the information I want from them and organize it

In [3]:
# Load the raw JSON dumps for both subreddits. The encoding is pinned to UTF-8
# so the read does not depend on the platform's default locale encoding.
with open('../Data/stock_random.json', 'r', encoding='utf-8') as f:
    stock_posts = json.load(f)
with open('../Data/crypto_random.json', 'r', encoding='utf-8') as f:
    crypto_posts = json.load(f)

In [4]:
# Wrap each loaded JSON payload in its own DataFrame.
stock_df = pd.DataFrame(data=stock_posts)
crypto_df = pd.DataFrame(data=crypto_posts)

## Defining a function to extract specific features ~ in this case, the selftext, the title, and the subreddit each item belongs to

In [5]:
def subreddit_content(df, content):
    """Add a column named ``content`` holding that field from each row's ``data`` entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``data`` column whose entries support ``entry[content]``
        lookups. The frame is modified in place.
    content : str
        Key to read from every ``data`` entry; also used as the name of the
        new column.

    Returns
    -------
    None
        The result is written to ``df[content]``.

    Raises
    ------
    KeyError
        If ``df`` has no ``data`` column, or a dict entry lacks ``content``.
    """
    # The list is fully built before the assignment, so a failed lookup
    # leaves ``df`` without a partially filled column.
    df[content] = [entry[content] for entry in df['data']]

In [6]:
# Pull the three fields of interest out of the stock posts, one column each.
for _field in ('selftext', 'title', 'subreddit'):
    subreddit_content(stock_df, _field)

In [7]:
# Pull the same three fields out of the crypto posts, one column each.
for _field in ('selftext', 'title', 'subreddit'):
    subreddit_content(crypto_df, _field)

### Once the new columns were added to each separate subreddit DataFrame, I dropped the original 'data' and 'kind' columns so that only the selftext, title, and subreddit columns remain.

In [8]:
# Keep only the extracted columns. `drop` returns new frames, so stock_df and
# crypto_df still hold the raw 'data' and 'kind' columns afterwards.
# Column labels are passed directly rather than as a DataFrame slice.
stock_text_title = stock_df.drop(columns=['data', 'kind'])
crypto_text_title = crypto_df.drop(columns=['data', 'kind'])


In [9]:
# Shapes of the stock and crypto frames, in that order.
tuple(frame.shape for frame in (stock_text_title, crypto_text_title))

((2229, 3), (2217, 3))

### I also dropped duplicate rows from my data

In [10]:
# Remove fully duplicated rows from each frame, rebinding the same names
# to the de-duplicated results.
crypto_text_title = crypto_text_title.drop_duplicates()
stock_text_title = stock_text_title.drop_duplicates()

#### Concatenated both subreddit dataframes together

In [11]:
# Stack the stock rows on top of the crypto rows and renumber the index from 0.
stock_crypto_data = pd.concat(
    [stock_text_title, crypto_text_title],
    axis=0,
    join='outer',
    ignore_index=True,
)

### Creating a numeric label for my subreddit column, naming it 'y', and adding it to my DataFrame to be used in other notebooks for modeling

In [12]:
# Binary target: 1 where the subreddit is 'StockMarket', 0 for every other value.
# The comparison is vectorized instead of looping over the column in Python.
stock_crypto_data['y'] = stock_crypto_data['subreddit'].eq('StockMarket').astype('int64')
# Keep `y` available as a plain list of ints, as before.
y = stock_crypto_data['y'].tolist()

##### Viewing my concatenated dataframe to make sure it has all my features before exporting it to a csv

In [13]:
# (rows, columns) of the combined frame after adding the 'y' column.
stock_crypto_data.shape

(2121, 4)

In [14]:
# Show the last three rows of the combined frame.
stock_crypto_data.tail(3)

Unnamed: 0,selftext,title,subreddit,y
2118,,"Goldman Sachs Folds Weak Hand; No Big Deal, Sa...",CryptoCurrency,0
2119,,Philippine Lawmakers Plan to Publish Crypto Ex...,CryptoCurrency,0
2120,,Daily FUD: Cryptocurrency markets are in meltd...,CryptoCurrency,0


#### Exporting to CSV with `index=False` so that the index is not written out as an extra unnamed column

In [15]:
# Write the combined frame and both per-subreddit frames to disk without the index.
for _frame, _csv_path in (
    (stock_crypto_data, '../Data/stock_crypto_data.csv'),
    (stock_text_title, '../Data/stock_text_title.csv'),
    (crypto_text_title, '../Data/crypto_text_title.csv'),
):
    _frame.to_csv(_csv_path, index=False)