In [1]:
# !pip install scikit-learn
# !pip install pandas
# !pip install emoji
# !pip install num2words
# !pip install nltk
# !pip install matplotlib
# !pip install wordcloud

In [4]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import emoji
#from num2words import num2words
import nltk
#from nltk.corpus import stopwords
#from nltk.stem import WordNetLemmatizer
#from nltk.stem import PorterStemmer
#nltk.download('stopwords')
#nltk.download('wordnet')
#import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
#from matplotlib import colors
#from wordcloud import WordCloud

In [1]:
def read_df_from_json(url):
    """Load a JSON-lines review file into a labelled DataFrame.

    Every non-blank line of the file is parsed as one JSON document. Records
    that lack any of the keys ``overall``, ``reviewText`` or ``summary`` are
    skipped. The ``overall`` rating is mapped to a sentiment label:
    ``3.0`` -> ``"neutral"``, above ``3.0`` -> ``"positive"``, below ``3.0``
    -> ``"negative"`` (anything that matches none of these keeps ``""``).

    Parameters
    ----------
    url : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``overall``, ``reviewText`` and ``summary``, with
        rows that repeat an earlier ``reviewText`` removed. The frame is empty
        (but still has these columns) when the file holds no usable record.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = ["label", "overall", "reviewText", "summary"]
    records = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(url, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue

            data = json.loads(line)
            if not ("overall" in data and "reviewText" in data and "summary" in data):
                continue

            rating = data["overall"]
            if rating == 3.0:
                label = "neutral"
            elif rating > 3.0:
                label = "positive"
            elif rating < 3.0:
                label = "negative"
            else:
                label = ""

            records.append({
                "label": label,
                "overall": rating,
                "reviewText": data["reviewText"],
                "summary": data["summary"],
            })

    # Passing the column list keeps the schema stable even with zero records,
    # so the de-duplication below never fails on a missing column.
    df = pd.DataFrame(records, columns=columns)
    return df.drop_duplicates(subset=["reviewText"])

In [2]:
def concatenate_dfs(list_of_links):
    """Read every file in ``list_of_links`` and stack the results.

    Each entry is loaded with ``read_df_from_json``; the per-file frames are
    concatenated in the order given and the row index is renumbered.
    """
    frames = [read_df_from_json(path) for path in list_of_links]
    return pd.concat(frames, ignore_index=True)

In [5]:
# Forward slashes resolve on every platform and keep the string literals free
# of backslash escape sequences; they also match the 'data/...' paths used by
# the CSV cells of this notebook.
links = [
    'data/AMAZON_FASHION_5.json',
    'data/All_Beauty_5.json',
    'data/Luxury_Beauty_5.json',
    'data/Clothing_Shoes_and_Jewelry_5.json',
]
df = concatenate_dfs(links)

In [6]:
# Tally the labels once, then report the size of each class.
label_counts = df["label"].value_counts()
print("neutre:", label_counts["neutral"])
print("pozitive:", label_counts["positive"])
print("negative:", label_counts["negative"])

neutre: 876977
pozitive: 6520468
negative: 901322


In [26]:
def split_and_write_csv(
    input_df,
    train_size_per_class=None,
    test_size_per_class=None,
    validation_size_per_class=None,
    output_dir='data',
    random_state=42,
):
    """Draw fixed-size train/test/validation samples per label and save them as CSV.

    ``input_df`` is grouped by its ``label`` column. Each group is shuffled,
    then split with ``train_test_split`` into a training part and a hold-out
    part, and the hold-out part is split again into test and validation.

    Nine files are written to ``output_dir`` (all without the index):

    * ``shuffled_train_data.csv`` - columns ``label`` and ``reviewText``,
      rows shuffled.
    * ``shuffled_train_labels.csv`` - column ``label``, shuffled with the same
      seed as the training data.
    * ``test_data.csv`` / ``validation_data.csv`` - column ``text`` (the
      review text), rows ordered by label group.
    * ``test_labels.csv`` / ``validation_labels.csv`` - column ``label``.
    * ``train_data_plot.csv`` / ``test_data_plot.csv`` /
      ``validation_data_plot.csv`` - every column of the sampled rows.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns ``label`` and ``reviewText``.
    train_size_per_class, test_size_per_class, validation_size_per_class : dict, optional
        Number of rows to draw for each label. When omitted the defaults are
        17000/11000/17000 (train), 2250/1500/2250 (test) and 5000/3000/5000
        (validation) for negative/neutral/positive.
    output_dir : str, optional
        Directory the CSV files are written to; it must already exist.
    random_state : int, optional
        Seed used for every shuffle and split.

    Raises
    ------
    KeyError
        If a label present in ``input_df`` is missing from a size mapping.
    ValueError
        If ``input_df`` has no rows to split, or (raised by
        ``train_test_split``) a group is too small for the requested sizes.
    """
    # Defaults are created per call rather than in the signature, so no dict
    # object is shared between invocations.
    if train_size_per_class is None:
        train_size_per_class = {'negative': 17000, 'neutral': 11000, 'positive': 17000}
    if test_size_per_class is None:
        test_size_per_class = {'negative': 2250, 'neutral': 1500, 'positive': 2250}
    if validation_size_per_class is None:
        validation_size_per_class = {'negative': 5000, 'neutral': 3000, 'positive': 5000}

    train_parts = []
    test_parts = []
    validation_parts = []

    for group_name, group_df in input_df.groupby('label'):
        n_train = train_size_per_class[group_name]
        n_test = test_size_per_class[group_name]
        n_validation = validation_size_per_class[group_name]

        shuffled_group_df = group_df.sample(frac=1, random_state=random_state)

        # First cut: training rows versus a hold-out pool that is exactly big
        # enough for the test and validation rows together.
        train_group, holdout_group = train_test_split(
            shuffled_group_df,
            train_size=n_train,
            test_size=n_test + n_validation,
            random_state=random_state,
        )
        # Second cut: divide the hold-out pool into test and validation.
        test_group, validation_group = train_test_split(
            holdout_group,
            test_size=n_validation,
            random_state=random_state,
        )

        train_parts.append(train_group)
        test_parts.append(test_group)
        validation_parts.append(validation_group)

    if not train_parts:
        raise ValueError("input_df contains no labelled rows to split")

    # One concatenation per split instead of growing frames inside the loop.
    train_data_plot = pd.concat(train_parts)
    test_data_plot = pd.concat(test_parts)
    validation_data_plot = pd.concat(validation_parts)

    # Data and labels are shuffled separately but with the same seed and the
    # same length, so their rows stay aligned.
    # NOTE(review): the training file carries 'label' and 'reviewText', while
    # the test/validation files carry a single 'text' column - confirm this
    # asymmetry is what the downstream cells expect.
    shuffled_train_data = train_data_plot[['label', 'reviewText']].sample(
        frac=1, random_state=random_state
    )
    shuffled_train_labels = train_data_plot[['label']].sample(
        frac=1, random_state=random_state
    )

    # Columns are selected and named explicitly rather than relying on the
    # name pandas assigns when a Series is concatenated onto a DataFrame.
    text_column = {'reviewText': 'text'}
    test_data = test_data_plot[['reviewText']].rename(columns=text_column)
    test_labels = test_data_plot[['label']]
    validation_data = validation_data_plot[['reviewText']].rename(columns=text_column)
    validation_labels = validation_data_plot[['label']]

    outputs = {
        'shuffled_train_data.csv': shuffled_train_data,
        'shuffled_train_labels.csv': shuffled_train_labels,
        'validation_data.csv': validation_data,
        'validation_labels.csv': validation_labels,
        'test_data.csv': test_data,
        'test_labels.csv': test_labels,
        'train_data_plot.csv': train_data_plot,
        'test_data_plot.csv': test_data_plot,
        'validation_data_plot.csv': validation_data_plot,
    }
    for file_name, frame in outputs.items():
        frame.to_csv(f'{output_dir}/{file_name}', index=False)


In [27]:
# Write the train/test/validation CSV files using the function's default
# per-class sample sizes.
split_and_write_csv(df)

In [19]:
# Reload the split files from disk.
# Training split (shuffled) and its labels.
train_data = pd.read_csv('data/shuffled_train_data.csv')
train_labels = pd.read_csv('data/shuffled_train_labels.csv')

# Test split and its labels.
test_data = pd.read_csv('data/test_data.csv')
test_labels = pd.read_csv('data/test_labels.csv')

# Validation split and its labels.
val_data = pd.read_csv('data/validation_data.csv')
val_labels = pd.read_csv('data/validation_labels.csv')

# Full-column copies of the sampled rows, kept for plotting.
train_data_plot = pd.read_csv('data/train_data_plot.csv')
test_data_plot = pd.read_csv('data/test_data_plot.csv')

train_data.shape

(45000, 1)