In [None]:
import pandas as pd
import json
import seaborn as sns

# READING THE JSON FILE

In [None]:
# Preview the JSON Lines file (one JSON object per line) as a DataFrame.
# The result is only displayed; it is not bound to a name.
pd.read_json(path_or_buf='Office_Products_5.json', lines=True)

In [None]:
# Path of the JSON Lines file that the following cells read line by line.
input_file = "Office_Products_5.json"

In [None]:
# Read the raw records, one JSON document per line.
# - The encoding is pinned so decoding does not depend on the platform default.
# - Iterating the file splits on newline characters only. str.splitlines()
#   also breaks on characters such as U+2028 and U+0085, which may appear
#   unescaped inside a JSON string and would cut one record into two.
# - Blank lines are skipped because json.loads cannot parse them.
with open(input_file, encoding="utf-8") as f:
    lines = [line.rstrip("\r\n") for line in f if line.strip()]

In [None]:
# Number of raw lines read from the input file.
len(lines)

In [None]:
import pandas as pd

# One-column frame holding each raw JSON line under the label 'json_element'.
df_inter = pd.DataFrame(lines).set_axis(['json_element'], axis=1)

In [None]:
import json

# Preview: decode every raw line with json.loads (the result is only displayed).
df_inter.json_element.apply(json.loads)

# NORMALIZING JSON LINES

In [None]:
# Decode each JSON line and flatten the resulting objects into columns.
df_final = pd.json_normalize(
    df_inter.json_element.apply(json.loads)
)

In [None]:
# Show the first row of the normalized frame.
df_final.head(1)

# LOOKING AT THE REVIEW TEXT

In [None]:
# Display the review text column.
df_final["reviewText"]

In [None]:
# Bar chart of how many reviews carry each `overall` value.
# The column is named by keyword: seaborn's only positional slot is `data`,
# so a bare Series passed positionally is not interpreted as the x variable.
sns.countplot(data=df_final, x="overall")

In [None]:
# List the column labels of the normalized frame.
df_final.columns

In [None]:
# Keep only the rows whose `verified` flag is True.
# The explicit .copy() makes df_final an independent frame, so the column
# assignments in later cells do not write into a slice of the unfiltered data
# (which pandas reports as SettingWithCopyWarning).
df_final = df_final[df_final.verified].copy()
df_final

# CONVERTING UNIXDATETIME TO A DATE STAMP

In [None]:
pip install DateTime

In [None]:
from datetime import datetime, timezone


def condition(row):
    """Format a Unix timestamp (seconds since the epoch) as ``MM-DD-YYYY``.

    The timestamp is interpreted in UTC rather than the machine's local
    timezone, so the resulting date is the same wherever the notebook runs.

    Parameters
    ----------
    row : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The date formatted as ``"%m-%d-%Y"``.
    """
    return datetime.fromtimestamp(row, tz=timezone.utc).strftime("%m-%d-%Y")


# Convert only while the column is still numeric: once it holds formatted
# strings, re-running the cell would otherwise fail inside fromtimestamp().
# assign() builds a new frame instead of writing into a filtered slice.
if pd.api.types.is_numeric_dtype(df_final["unixReviewTime"]):
    df_final = df_final.assign(
        unixReviewTime=df_final["unixReviewTime"].apply(condition)
    )

In [None]:
# Show the first row again to inspect the converted unixReviewTime value.
df_final.head(1)

# DROPPING reviewTime AND style.* COLUMNS

In [None]:
# Columns to remove from the working frame: `reviewTime` plus the flattened
# `style.*` attributes listed below.
columns_to_drop = [
    'reviewTime', 'style.Format:', 'style.Package Quantity:',
    'style.Color:', 'style.Size:', 'style.Style:', 'style.style name:',
    'style.Design:', 'style.Pattern:', 'style.Size Name:',
    'style.Item Package Quantity:', 'style.Style Name:',
    'style.Package Type:', 'style.Color Name:', 'style.Number of Items:',
    'style.Product Packaging:', 'style.Length:', 'style.Overall Height:',
    'style.Team Name:', 'style.Overall Length:', 'style.Thickness:',
    'style.style:', 'style.Model:', 'style.Edition:', 'style.Model Number:',
    'style.Shape:', 'style.Platform:', 'style.Material Type:',
    'style.Material:', 'style.Flavor:', 'style.Gift Amount:',
]

# drop() returns a new frame and leaves the original untouched, so the result
# must be assigned back for the columns to actually disappear.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
df_final = df_final.drop(columns=columns_to_drop, errors="ignore")
df_final.head()

# CHECKING THAT ONLY VERIFIED "TRUE" REVIEWS REMAIN

In [None]:
# Count the non-null reviewerID entries for each value of `verified`,
# returned as a two-column frame.
verified_count = (
    df_final
    .groupby(['verified'])['reviewerID']
    .count()
    .reset_index()
)
verified_count

# FINDING THE DATES FOR WHICH THE REVIEWS WERE COLLECTED

In [None]:
# Earliest review date. The column holds "MM-DD-YYYY" strings, and min() on
# strings is alphabetical (month first), not chronological, so the values are
# parsed to datetimes before comparing and formatted back for display.
print(pd.to_datetime(df_final.unixReviewTime, format="%m-%d-%Y").min().strftime("%m-%d-%Y"))

In [None]:
# Latest review date. The column holds "MM-DD-YYYY" strings, and max() on
# strings is alphabetical (month first), not chronological, so the values are
# parsed to datetimes before comparing and formatted back for display.
print(pd.to_datetime(df_final.unixReviewTime, format="%m-%d-%Y").max().strftime("%m-%d-%Y"))

# NLP PROCESSING

In [None]:
# Discard the rows that have no review text (rows are the default axis).
df_final = df_final.dropna(subset=['reviewText'])

In [None]:
# Display the frame after rows without review text were removed.
df_final

In [None]:
pip install nltk

In [None]:
import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet

In [None]:
# Matches numeric HTML character references of the form "&#<digits>;".
pattern = r"\&\#[0-9]+\;"

# Remove every match from the review text and store the result in a new column.
df_final["preprocessed"] = df_final["reviewText"].str.replace(pat=pattern, repl="", regex=True)

# Spot-check one cleaned review. .iloc has to be applied to the Series inside
# the print call: print() returns None, so print(...).iloc[...] raised
# AttributeError.
# NOTE(review): 709987 is a hard-coded row position; it raises IndexError when
# the filtered frame has fewer rows.
print(df_final["preprocessed"].iloc[709987])

In [None]:
# NLTK resources that are looked up and, if missing, downloaded.
resources = ["wordnet", "stopwords", "punkt",
             "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

# Sub-directory of the NLTK data folder under which each resource is stored.
# Looking everything up under "tokenizers/" only ever finds "punkt", so the
# other resources were reported missing and downloaded again on every run.
# NOTE(review): recent NLTK releases reportedly expect "punkt_tab" and
# "averaged_perceptron_tagger_eng" — confirm against the installed version.
resource_dirs = {
    "wordnet": "corpora",
    "stopwords": "corpora",
    "punkt": "tokenizers",
    "averaged_perceptron_tagger": "taggers",
    "maxent_treebank_pos_tagger": "taggers",
}

for resource in resources:
    try:
        nltk.data.find(resource_dirs[resource] + "/" + resource)
    except LookupError:
        # Not installed locally yet: fetch it once.
        nltk.download(resource)

# Shared lemmatizer instance used by lemmatize_word().
lemme = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """Lemmatize a sequence of ``(word, pos_tag)`` pairs.

    The first letter of each tag selects the WordNet part of speech passed to
    the lemmatizer: ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun,
    ``R`` -> adverb. Words with any other (or empty) tag are kept unchanged.

    Each input pair yields exactly one output entry. The previous version used
    two separate ``if`` chains, so adjectives and verbs were appended twice:
    once lemmatized and once more, unchanged, by the trailing ``else``.

    Parameters
    ----------
    tagged_token : iterable of (str, str)
        Pairs of word and part-of-speech tag.

    Returns
    -------
    list of str
        One lemma (or the original word) per input pair, in input order.
    """
    pos_by_prefix = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    root = []
    for token in tagged_token:
        word, tag = token[0], token[1]
        # tag[:1] instead of tag[0] so an empty tag falls through to "unchanged".
        pos = pos_by_prefix.get(tag[:1])
        if pos is None:
            root.append(word)
        else:
            root.append(lemme.lemmatize(word, pos))
    return root

def lemmatize_doc(document):
    """Return ``document`` as a single space-joined string of lemmas.

    The text is split into sentences with ``sent_tokenize``. In each sentence
    the characters `` ` ' " , . ! ? ( ) `` are replaced by spaces, the result
    is split into words with ``word_tokenize``, tagged with ``pos_tag`` and
    passed to ``lemmatize_word``. The lemmas of all sentences are joined with
    single spaces.
    """
    lemmas = []
    for sent in sent_tokenize(document):
        cleaned = re.sub(r"[`'\",.!?()]", " ", sent)
        lemmas += lemmatize_word(pos_tag(word_tokenize(cleaned)))
    return " ".join(lemmas)

# Lemmatize every cleaned review.
# NOTE(review): the target column is spelled 'preprocesssed' (three s), so this
# adds a new column next to 'preprocessed' rather than replacing it — confirm
# that is intended before renaming.
df_final['preprocesssed'] = df_final['preprocessed'].apply(lemmatize_doc)

In [None]:
# Display the frame including the lemmatized column.
df_final