# EDA - Combining data from all 6 websites

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [69]:
# create a function to open files from json

def load_json(filename):
  with open(filename, 'r') as f:
    data = json.load(f)
  return pd.DataFrame(data)

In [70]:
# load all 6 dataframes

champkoi = load_json('champkoi.json')
gckoi = load_json('gckoi.json')
grandkoi = load_json('grandkoi.json')
kloubec = load_json('kloubec.json')
nextdaykoi = load_json('nextdaykoi.json')
sacramentokoi = load_json('sacramentokoi.json')

In [71]:
# create a function to put all the info from .info into a dataframe

def info_to_df(dataframe, abbreviation):
  # get the datatypes
  dtypes = dataframe.dtypes
  # get the non-null counts
  counts = dataframe.count()
  # create a new dataframe
  return pd.DataFrame({'Columns': dtypes.index, f'Non-Null_{abbreviation}': counts.values})

In [72]:
# create info dfs for all the dataframes

champkoi_info = info_to_df(champkoi, 'champkoi')
gckoi_info = info_to_df(gckoi, 'gckoi')
grandkoi_info = info_to_df(grandkoi, 'grandkoi')
kloubec_info = info_to_df(kloubec, 'kloubec')
nextdaykoi_info = info_to_df(nextdaykoi, 'nextdaykoi')
sacramentokoi_info = info_to_df(sacramentokoi, 'nextdaykoi')


In [73]:
# outer merge all the dataframes together

df = pd.merge(champkoi_info, gckoi_info, on='Columns', how='outer')
df = pd.merge(df, grandkoi_info, on='Columns', how='outer')
df = pd.merge(df, kloubec_info, on='Columns', how='outer')
df = pd.merge(df, nextdaykoi_info, on='Columns', how='outer')
df = pd.merge(df, sacramentokoi_info, on='Columns', how='outer')
df

Unnamed: 0,Columns,Non-Null_champkoi,Non-Null_gckoi,Non-Null_grandkoi,Non-Null_kloubec,Non-Null_nextdaykoi_x,Non-Null_nextdaykoi_y
0,age,,402.0,,,,
1,breeder,35.0,402.0,361.0,,,
2,category,185.0,30.0,,,1948.0,423.0
3,certificate,,402.0,,,,
4,cleaned_strings,185.0,,,,,
5,description,,402.0,,,,
6,details,,,367.0,,,
7,gender,33.0,402.0,,,149.0,156.0
8,gpt_dict,,408.0,,,,
9,image,,,367.0,517.0,1948.0,423.0


In [74]:
# change the index to 'Columns'

df = df.set_index('Columns')

# transpose the df

df.T


Columns,age,breeder,category,certificate,cleaned_strings,description,details,gender,gpt_dict,image,...,price,sku,split,tags,tags_from_category,tags_from_title,tags_list,title,uuid,variety
Non-Null_champkoi,,35.0,185.0,,185.0,,,33.0,,,...,185.0,185.0,185.0,185.0,,,185.0,185.0,185.0,185.0
Non-Null_gckoi,402.0,402.0,30.0,402.0,,402.0,,402.0,408.0,,...,408.0,408.0,,408.0,,,,408.0,408.0,
Non-Null_grandkoi,,361.0,,,,,367.0,,,367.0,...,346.0,367.0,,367.0,,,,367.0,367.0,
Non-Null_kloubec,,,,,,,,,,517.0,...,517.0,517.0,,517.0,,,,517.0,517.0,
Non-Null_nextdaykoi_x,,,1948.0,,,,,149.0,,1948.0,...,1948.0,1910.0,,1948.0,1948.0,1948.0,,1948.0,1948.0,
Non-Null_nextdaykoi_y,,,423.0,,,,,156.0,,423.0,...,423.0,422.0,,422.0,423.0,422.0,,423.0,423.0,


In [75]:
# check the transposed df for nulls

df.T.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Non-Null_champkoi to Non-Null_nextdaykoi_y
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 1 non-null      float64
 1   breeder             3 non-null      float64
 2   category            4 non-null      float64
 3   certificate         1 non-null      float64
 4   cleaned_strings     1 non-null      float64
 5   description         1 non-null      float64
 6   details             1 non-null      float64
 7   gender              4 non-null      float64
 8   gpt_dict            1 non-null      float64
 9   image               4 non-null      float64
 10  image_path          2 non-null      float64
 11  image_url           6 non-null      float64
 12  in_stock            2 non-null      float64
 13  length_cm           1 non-null      float64
 14  length_inches       6 non-null      float64
 15  link                6 non-null