# Data Collection Pipeline

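- Creates **data/kickstart_data_merged_with_empty.csv** (all rows, including those with no story match) and **data/kickstarter_data_merged.csv** (only rows with a story)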
- Next file to run: Data Cleaning.ipynb

In [1]:
import pandas as pd 
import os 
import json
import re

# Data Import

## Import data scraped from the Kickstarter GraphQL endpoint

In [2]:
start = "data/"


def _load_json(filename):
    """Return the parsed content of the JSON file ``start + filename``."""
    # Binary mode lets ``json.load`` detect the UTF-8/16/32 encoding itself
    # instead of decoding with the platform's locale default.
    with open(start + filename, "rb") as f:
        return json.load(f)


# Story / risk chunks; the numbers in each file name are the index range
# the chunk covers.
# NOTE(review): no chunk covering 122106-162808 is loaded in this cell and
# there is no story_list4 / risk_list4 -- confirm the gap is intentional.
story_list1 = _load_json("0_44134_story.json")
risk_list1 = _load_json("0_44134_risk.json")
story_list2 = _load_json("44135_81404_story.json")
risk_list2 = _load_json("44135_81404_risk.json")
story_list3 = _load_json("81404_122106_story.json")
risk_list3 = _load_json("81404_122106_risk.json")
story_list5 = _load_json("162808_203510_story.json")
risk_list5 = _load_json("162808_203510_risk.json")

# Only chunks 1 and 2 are concatenated; chunks 3 and 5 stay separate because
# they are placed through explicit index mappings in the merge step.
story_list = story_list1 + story_list2
risk_list = risk_list1 + risk_list2

## Import data from Web Robots (Web scraping service with public datasets)

In [None]:
# Load the Web Robots export into a DataFrame.
robot_df = pd.read_csv(filepath_or_buffer="data/web_robots.csv")

## Import data from WebScraper.io Scraping (Web scraping service)

In [None]:
base = "data/"

# WebScraper.io export: keep one row per project link.
webscraper_df = pd.read_excel(base + "kickstarter_by_categories.xlsx").drop_duplicates(subset='Link-href')

# CSVs that accompany the GraphQL story/risk chunks.
graph_df1, graph_df2, graph_df3 = (
    pd.read_csv(base + csv_name)
    for csv_name in ("bandy.csv", "merged_ivan_valentin.csv", "df1_radell.csv")
)

# Only the two index columns of graph_df2 are needed for the later mapping.
graph_df2_mapping = graph_df2.loc[:, ['ivan_index', 'valentin_index']]

# Data Merge

## Merge GraphQL Data (Story + Risk) with Webrobots Dataset to get Dataset X

In [None]:
# Generating Webrobots + GraphQL datasets


def _assign_by_final_index(frame, column, values_by_index):
    """Write ``values_by_index[k]`` into *column* where ``final_index == k``.

    Rows whose ``final_index`` is not a key keep their current value when
    *column* already exists, and are left null otherwise.  A single mapped
    pass replaces one full boolean scan of the frame per assigned value.
    """
    keys = frame['final_index']
    mapped = keys.map(values_by_index).astype(object)
    if column in frame.columns:
        # Do not clobber values that are already present on untouched rows.
        mapped = mapped.where(keys.isin(list(values_by_index)), frame[column])
    frame[column] = mapped


# graph1 + webrobots
df_robot_join = robot_df.merge(graph_df1, on="id", how="left")
df_robot_join['final_index'] = range(len(df_robot_join))

# final_index -> text.  The sources are added in the order direct chunks,
# graph3, graph2, so a later source overwrites an earlier one for the same
# final_index.
# NOTE(review): assumes story/risk entries are scalars (strings or None).
story_by_index = {}
risk_by_index = {}

# Chunks 1+2 line up positionally with final_index.
if len(story_list) == len(risk_list):
    story_by_index.update(enumerate(story_list))
    risk_by_index.update(enumerate(risk_list))
else:
    print("check story and risk list")

# graph3 + webrobots
graph_df3['graph_df3_index'] = range(len(graph_df3))
graph_df3 = graph_df3[(graph_df3['graph_df3_index'] >= 162808) & (graph_df3['graph_df3_index'] < 203510)]
graph_df3 = graph_df3[['graph_df3_index', 'id']]
graph_df3_range = df_robot_join[(df_robot_join['final_index'] >= 162808) & (df_robot_join['final_index'] < 203510)]
graph_df3_merged = graph_df3_range.merge(graph_df3, on='id', how='inner')

graph_df3_mapping = graph_df3_merged[['final_index', 'graph_df3_index']].sort_values('graph_df3_index')
base_num = 162808  # graph_df3_index of the first entry in story_list5 / risk_list5

for final_idx, chunk_idx in zip(graph_df3_mapping['final_index'], graph_df3_mapping['graph_df3_index']):
    offset = int(chunk_idx) - base_num
    story_by_index[int(final_idx)] = story_list5[offset]
    risk_by_index[int(final_idx)] = risk_list5[offset]

# graph2 + webrobots
graph_df2_mapping = graph_df2_mapping.sort_values('valentin_index')
base_num = 81404  # valentin_index of the first entry in story_list3 / risk_list3

for final_idx, chunk_idx in zip(graph_df2_mapping['ivan_index'], graph_df2_mapping['valentin_index']):
    offset = int(chunk_idx) - base_num
    story_by_index[int(final_idx)] = story_list3[offset]
    risk_by_index[int(final_idx)] = risk_list3[offset]

_assign_by_final_index(df_robot_join, 'story', story_by_index)
_assign_by_final_index(df_robot_join, 'risk', risk_by_index)

# Get dataset_x: Exclude graphql data that could not fully merge with webrobots.
# .copy() makes dataset_x an independent frame, so adding or overwriting
# columns on it later does not trigger SettingWithCopyWarning.
dataset_x = df_robot_join[pd.notnull(df_robot_join['story'])].copy()

## Merge Final GraphQL Dataset (Story + Risk) with Webscraper Dataset (Rewards)

In [None]:
def _strip_ref_suffix(url):
    """Return ``str(url)`` cut at the first ``"?ref"``, or unchanged if absent.

    Slicing with ``str.find`` would drop the last character of every URL
    that has no ``"?ref"`` (``find`` returns -1), corrupting the join key.
    """
    return str(url).partition("?ref")[0]


# parsing join key
dataset_y = webscraper_df
dataset_y['Link-href'] = dataset_y['Link-href'].apply(_strip_ref_suffix)
# Work on an independent copy so the column assignment below does not write
# into a filtered slice of another frame.
dataset_x = dataset_x.copy()
dataset_x['main_url'] = dataset_x['main_url'].apply(_strip_ref_suffix)
combined_dataset = dataset_y.merge(dataset_x, left_on="Link-href", right_on="main_url", how="left")

## FILL IN MISSING DATA HERE
# Written before filtering, so rows without a story match are kept in this file.
combined_dataset.to_csv("data/kickstart_data_merged_with_empty.csv")

# Exclude data in dataset X that could not fully merge with dataset Y
final_combined_dataset = combined_dataset[pd.notnull(combined_dataset['story'])]

# Export

In [None]:
# Persist the filtered merge result.
final_combined_dataset.to_csv(path_or_buf="data/kickstarter_data_merged.csv")