## Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
# to work with json
import json

#visualization libraries
import matplotlib.pylab as plt
import seaborn as sns

# style for plots
plt.style.use('ggplot')
#pd.set_option('max_columns', 200)

In [2]:
# Load the raw JSON export and flatten the per-user records into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
DATA_PATH = '/home/bbruno/all_here/python course/vinnie/data/users-data-prod.json'

# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform/locale default that open() would otherwise pick.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# One row per entry of the top-level 'usersData' list.
df = pd.json_normalize(data['usersData'])
df.columns

Index(['userId', 'wines', 'tastingNotes', 'pairings'], dtype='object')

In [3]:
# Preview the first rows of the per-user frame.
df.head()

Unnamed: 0,userId,wines,tastingNotes,pairings
0,131a3994-a510-4ead-93d0-3d89f06b9602,[],[],"[{'id': '4tO1hMInfEtHv4IbyIg9', 'date': '2022-..."
1,16e6321b-1dd8-4db0-a491-627055a20d1d,[],[],"[{'id': '4nDez8hvWjj4UscxL3Op', 'date': '2023-..."
2,bffc7c74-ccf9-429b-a6a5-d7bc8a53441e,[],[],"[{'id': '0eaezCUA4haEf2GVMSCo', 'date': '2023-..."
3,97e19ee8-5d4a-4691-bdf0-a20f284d9f0a,[],[],"[{'id': 'A2AF5ggyi9GtyZf1x9LG', 'date': '2023-..."
4,690e6b1c-8af0-4a8b-b4a8-44030d7c7744,"[{'grapes': ['Cabernet Sauvignon', 'Merlot', '...",[],"[{'id': 'yNiSBgN9jZ9b1aVz2pND', 'date': '2021-..."


In [4]:
# (number of rows, number of columns) of the per-user frame.
df.shape

(4504, 4)

In [5]:
# Inspect the raw structure of one user record only; displaying the whole
# 'usersData' list would dump every user's data into the notebook output.
data['usersData'][:1]

[{'userId': '131a3994-a510-4ead-93d0-3d89f06b9602',
  'wines': [],
  'tastingNotes': [],
  'pairings': [{'id': '4tO1hMInfEtHv4IbyIg9',
    'date': '2022-01-30T10:42:30.715Z',
    'feedbackOnPairing': 'neutral',
    'selected': 'sangiovese_red',
    'suggestionsList': ['sangiovese_red',
     'grenache_red',
     'nebbiolo_red',
     'gewurztraminer_white',
     'prosecco_white']},
   {'id': '98zJC8sIv4mxZQxediVF',
    'date': '2021-12-02T14:11:01.208Z',
    'feedbackOnPairing': 'neutral',
    'selected': 'carmenere_red',
    'suggestionsList': ['carmenere_red',
     'cabernet_franc_red',
     'petit_verdot_red',
     'marsanne_white',
     'riesling_white']},
   {'id': 'A3F4uprwpFswMrreRwup',
    'date': '2021-12-02T14:14:44.861Z',
    'feedbackOnPairing': 'neutral',
    'selected': 'riesling_white',
    'suggestionsList': ['riesling_white',
     'pinot_blanc_white',
     'sauvignon_blanc_white',
     'gruner_veltliner_white',
     'cabernet_franc_red']},
   {'id': 'HNbre3Fb8Y1P7zlzR6iY

In [6]:
# Check the column dtypes; the columns hold strings or nested lists, which
# pandas reports as `object`.
df.dtypes

userId          object
wines           object
tastingNotes    object
pairings        object
dtype: object

## Feedback on pairings

In [7]:
# Flatten every user's pairings into one record per pairing. Only the user id,
# the feedback label and the selected wine are kept; the pairing id and date
# are not needed for this analysis.
feedback_data = [
    {
        'userId': user_data['userId'],
        'feedback': pairing['feedbackOnPairing'],
        'wine': pairing['selected'],
    }
    for user_data in data['usersData']
    for pairing in user_data['pairings']
]

# Naming the columns explicitly keeps the frame well-formed (and the column
# lookups below working) even when no user has any pairing at all.
feedback_df = pd.DataFrame(feedback_data, columns=['userId', 'feedback', 'wine'])

# Encode the textual feedback as a signed score:
# 0 = neutral, 1 = like, -1 = dislike.
feedback_mapping = {'neutral': 0, 'like': 1, 'dislike': -1}
feedback_df['feedback'] = feedback_df['feedback'].map(feedback_mapping)

# Any label that is not a key of feedback_mapping becomes NaN after .map();
# such entries are deliberately counted as neutral (0).
feedback_df['feedback'] = feedback_df['feedback'].fillna(0)

# print the unique values in the feedback column
print("the values after mapping are:", feedback_df['feedback'].unique())

# Store the score as an integer column.
feedback_df = feedback_df.astype({'feedback': 'int'})

print(feedback_df.dtypes)
feedback_df


the values after mapping are: [ 0 -1  1]
userId      object
feedback     int64
wine        object
dtype: object


Unnamed: 0,userId,feedback,wine
0,131a3994-a510-4ead-93d0-3d89f06b9602,0,sangiovese_red
1,131a3994-a510-4ead-93d0-3d89f06b9602,0,carmenere_red
2,131a3994-a510-4ead-93d0-3d89f06b9602,0,riesling_white
3,131a3994-a510-4ead-93d0-3d89f06b9602,0,chardonnay_white
4,131a3994-a510-4ead-93d0-3d89f06b9602,0,nebbiolo_red
...,...,...,...
10116,5df09a59-c0e4-4c83-8e32-007b39edce53,0,nero_davola_red
10117,5df09a59-c0e4-4c83-8e32-007b39edce53,0,sangiovese_red
10118,6caf1bcb-095c-4b1c-a412-c3b6860949b0,0,merlot_red
10119,6caf1bcb-095c-4b1c-a412-c3b6860949b0,0,champagne_white


In [8]:
# Tally the pairings by feedback score (0 = neutral, 1 = positive, -1 = negative).
feedback_scores = feedback_df['feedback']
print("the number of total feedback is:", len(feedback_df))
print("the number of neutral feedback is:", int((feedback_scores == 0).sum()))
print("the number of positive feedback is:", int((feedback_scores == 1).sum()))
print("the number of negative feedback is:", int((feedback_scores == -1).sum()))


the number of total feedback is: 10121
the number of neutral feedback is: 9811
the number of positive feedback is: 246
the number of negative feedback is: 64


In [9]:

# positive_feedback_users = feedback_df[feedback_df['feedback'] == 1]['userId']  # 'like' is encoded as 1
# positive_feedback_users_list = positive_feedback_users.to_list()
# print(positive_feedback_users_list)
# print(len(positive_feedback_users_list))

In [10]:
# negative_feedback_users = feedback_df[feedback_df['feedback'] == -1]['userId']  # 'dislike' is encoded as -1
# negative_feedback_users_list = negative_feedback_users.to_list()
# print(negative_feedback_users_list)
# print(len(negative_feedback_users_list))

In [11]:
# neutral_feedback_users = feedback_df[feedback_df['feedback'] == 0]['userId']  # 'neutral' is encoded as 0
# neutral_feedback_users_list = neutral_feedback_users.to_list()
# print(neutral_feedback_users_list)
# print(len(neutral_feedback_users_list))