In [1]:
import pandas as pd
import numpy as np

def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]
    
def clean_data(df, *argv):
    df1 = df.copy()
    for series in argv:
        df1[series] = df1[series].str.lower()
        df1[series] = df1[series].str.replace("\n","")
        df1[series] = df1[series].str.replace("\r","")
        df1[series] = df1[series].str.replace("-"," ")
        df1[series] = df1[series].str.replace("[^\w\s]","")
        df1[series] = df1[series].str.strip()
    df1 = df1.fillna("None")
    df1 = df1.replace("", "missing")
    return df1

def top_n(df, n, series):
    return df.sort_values(series, ascending = False).iloc[:n]

In [2]:
home = pd.read_csv("https://raw.githubusercontent.com/ayakkala1/Data301/master/crunchyroll/crunchy_home.csv")

home["similar"] = home["similar"].apply(lambda x: x.split("::"))
home["tags"] = home["tags"].apply(lambda x: x.split("::"))
home["agg_rating"] = ((home["1"] + home["2"] * 2 + home["3"] * 3 + home["4"] * 4 + home["5"] * 5)/
                        home[["1","2","3","4","5"]].sum(axis = 1))

home["name"] = home["name"].str.replace("-"," ").str.lower()

reviews = pd.read_csv("https://raw.githubusercontent.com/ayakkala1/Data301/master/crunchyroll/crunchy_review.csv")

main = pd.read_csv("https://raw.githubusercontent.com/ayakkala1/Data301/master/crunchyroll/crunchy_main.csv")

main["similar"] = main["similar"].apply(lambda x: x.split("::"))
main["tags"] = main["tags"].apply(lambda x: x.split("::"))
main["name"] = main["name"].str.replace("-"," ").str.lower()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
home_s = home.sample(frac = .1)

In [4]:
tags = explode(home_s[["name","tags"]],"tags")

In [5]:
similar = clean_data(explode(home_s[["name","similar"]],"similar"),"similar")

In [6]:
ratings = home_s[["name","agg_review","1","2","3","4","5","agg_rating"]]

In [7]:
reviews = reviews.rename(columns = {"show":"name"})

In [8]:
reviews_s = clean_data(reviews[reviews["name"].isin(home["name"])].sample(frac = .05),
                       "datetime",
                       "review",
                       "summary",
                       "useful")

In [9]:
reviews_s = reviews_s[["name","datetime","rating","review","summary"]]

In [10]:
ratings["name"] = "'" + ratings["name"] + "'"
tags["tags"] = "'" + tags["tags"] + "'"
tags["name"] = "'" + tags["name"] + "'"
similar["similar"] = "'" + similar["similar"] + "'"
similar["name"] = "'" + similar["name"] + "'"
reviews_s["name"] = "'" + reviews_s["name"] + "'"
reviews_s["review"] = "'" + reviews_s["review"] + "'"
reviews_s["summary"] = "'" + reviews_s["summary"] + "'"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
tags.to_csv("tags.csv",index = False)
similar.to_csv("similar.csv",index = False)
ratings.to_csv("ratings.csv",index = False)
reviews_s.to_csv("reviews.csv",index = False)