-
Notifications
You must be signed in to change notification settings - Fork 0
/
5270 - Python Project - Aishwarya Gupta, Rahul Bhogale.py
312 lines (188 loc) · 7.7 KB
/
5270 - Python Project - Aishwarya Gupta, Rahul Bhogale.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
# In[2]:
df =pd.read_excel("C:\\Users\\agupta25\\Desktop\\5270\\Python project\\Adidas.xlsx")
df.head()
# In[3]:
###Cleaning dataset
# In[4]:
# 1.fill na with mean values for price per unit columns
mean_price = df['Price per Unit'].mean() # Calculate the mean of the 'Price per Unit' column
df['Price per Unit'].fillna(mean_price, inplace=True) # Fill missing values with the mean value
df.head()
# In[5]:
# 2.replace 0 values with formula and formatting into int
df['Total Sales'] = (df['Price per Unit'] * df['Units Sold']).astype(int)
df['Operating Profit'] = (df['Operating Margin'] * df['Total Sales']).astype(int)
df.head()
# In[6]:
# 3.split the columns
df['Gender'] = df['Product'].str.split("'s").str[0]
df.head()
# In[7]:
# 4.Data values formatting
df['Invoice Date'] = pd.to_datetime(df['Invoice Date']) # Convert 'Invoice Date' to datetime format
df['Year'] = df['Invoice Date'].dt.year # Extract the year
df.head()
# In[8]:
# 5. Trim spaces
#before trimming
df['length of retailer'] = df['Retailer'].str.len()
df.head()
# In[9]:
#After trimming spaces
df['Retailer1'] = df['Retailer'].str.strip()
df['length of retailer1'] = df['Retailer1'].str.len()
df.head()
# In[10]:
# Statistical information
df.describe()
# In[11]:
#statistical func 1: col: Price per Unit
from scipy import stats
# Filter dataframe for New York state
ny_df = df[df['State'] == 'New York']
# Group by state and calculate the descriptive statistics for 'Price per Unit'
grouped_stats = ny_df.groupby('State')['Price per Unit'].describe()
# Print the descriptive statistics
print(grouped_stats)
# Calculate mode for 'Price per Unit'
mode = stats.mode(ny_df['Price per Unit'])
print("Mode:", mode.mode[0])
# Calculate median for 'Price per Unit'
median = ny_df['Price per Unit'].median()
print("Median:", median)
# In[12]:
#statistical func 2:
# Filter dataframe for Sales Method = Online
online_df = df[df['Sales Method'] == 'Online']
# Calculate operating margin
operating_margin = online_df['Operating Margin']
# Describe operating margin
operating_margin_stats = operating_margin.describe()
print("Operating Margin Description:")
print(operating_margin_stats)
# Calculate mode for operating margin
operating_margin_mode = stats.mode(operating_margin)
print("Operating Margin Mode:", operating_margin_mode.mode[0])
# Calculate median for operating margin
operating_margin_median = operating_margin.median()
print("Operating Margin Median:", operating_margin_median)
# In[13]:
#statistical func 3:
# Filter dataframe for Year = 2021
df_2021 = df[df['Year'] == 2021]
# Calculate total sales for 2021
total_sales_2021 = df_2021['Total Sales']
# Describe total sales for 2021
total_sales_stats = total_sales_2021.describe()
print("Total Sales Description for 2021:")
print(total_sales_stats)
# Calculate mode for total sales in 2021
total_sales_mode = stats.mode(total_sales_2021)
print("Total Sales Mode for 2021:", total_sales_mode.mode[0])
# Calculate median for total sales in 2021
total_sales_median = total_sales_2021.median()
print("Total Sales Median for 2021:", total_sales_median)
# In[14]:
###Graphs
# In[15]:
#Which region has the highest revenue and operating profit, and which region has the lowest?
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming you have a DataFrame named df with the necessary columns
# Filter the data to exclude any missing values in Total Sales and Operating Profit columns
filtered_df = df.dropna(subset=['Total Sales', 'Operating Profit'])
# Calculate the sum of Total Sales and Operating Profit by Region
revenue_profit_by_region = filtered_df.groupby('Region').agg({'Total Sales': 'sum', 'Operating Profit': 'sum'}).reset_index()
# Sort the data by Total Sales and Operating Profit in descending order
revenue_profit_by_region = revenue_profit_by_region.sort_values(by=['Total Sales', 'Operating Profit'], ascending=False)
# Create bar plots using Seaborn
fig, axes = plt.subplots(2, 1, figsize=(10, 12))
# Total Sales by Region
sns.barplot(data=revenue_profit_by_region, x='Total Sales', y='Region', color='blue', ax=axes[0])
axes[0].set_xlabel('Total Sales')
axes[0].set_ylabel('Region')
axes[0].set_title('Total Sales by Region')
# Operating Profit by Region
sns.barplot(data=revenue_profit_by_region, x='Operating Profit', y='Region', color='green', ax=axes[1])
axes[1].set_xlabel('Operating Profit')
axes[1].set_ylabel('Region')
axes[1].set_title('Operating Profit by Region')
# Region with highest revenue and operating profit
highest_revenue_region = revenue_profit_by_region.iloc[0]['Region']
highest_profit_region = revenue_profit_by_region.iloc[0]['Region']
# Region with lowest revenue and operating profit
lowest_revenue_region = revenue_profit_by_region.iloc[-1]['Region']
lowest_profit_region = revenue_profit_by_region.iloc[-1]['Region']
plt.tight_layout()
plt.show()
print('Region with highest revenue:', highest_revenue_region)
print('Region with highest operating profit:', highest_profit_region)
print('Region with lowest revenue:', lowest_revenue_region)
print('Region with lowest operating profit:', lowest_profit_region)
# In[16]:
#How does the number of units sold vary across different products?
import pandas as pd
import plotly.express as px
# Assuming you have a DataFrame named df with the necessary columns
fig = px.pie(df, values='Units Sold', names='Product', title='Units Sold by Product')
fig.show()
# In[17]:
# How does the relationship between total sales and operating profit differ between men and women?
# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Filtered DataFrame for men and women
df_men = df[df['Gender'] == 'Men']
df_women = df[df['Gender'] == 'Women']
# Set the style of the plots
sns.set(style="darkgrid")
# Plot for men (left subplot)
sns.scatterplot(data=df_men, x='Total Sales', y='Operating Profit', ax=axes[0], color='blue')
axes[0].set_title('Men')
axes[0].set_xlabel('Sales')
axes[0].set_ylabel('Margin')
# Plot for women (right subplot)
sns.scatterplot(data=df_women, x='Total Sales', y='Operating Profit', ax=axes[1], color='orange')
axes[1].set_title('Women')
axes[1].set_xlabel('Sales')
axes[1].set_ylabel('Margin')
# Adjust the spacing between subplots
plt.tight_layout()
# Display the plot
plt.show()
# In[18]:
#How does the price per unit affect the number of units sold and total sales?
# Set the style of the plot
sns.set(style="darkgrid")
# Create the bubble chart
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Price per Unit', y='Units Sold', size='Total Sales', sizes=(30, 300),
alpha=0.7, edgecolor='black')
# Set the title and labels for the plot
plt.title('Price per Unit vs Number of Units Sold (Bubble Size: Total Sales)')
plt.xlabel('Price per Unit')
plt.ylabel('Number of Units Sold')
# Display the plot
plt.show()
# In[19]:
#What is the total revenue generated by the retailer during a specific period?
# Filter the DataFrame for retailer='Amazon' and year='2021'
filtered_df = df[(df['Retailer'] == 'Amazon') & (df['Year'] == 2021)]
df1 = filtered_df[['Retailer', 'Year', 'Region', 'Product', 'Total Sales']]
# Group the filtered data by product and region and calculate the average total sales
average_sales = df1.groupby(['Region', 'Product'])['Total Sales'].mean().reset_index()
# Create the pivot table
pivot_table = average_sales.pivot(index='Region', columns='Product', values='Total Sales')
# Create the heatmap using Seaborn
plt.figure(figsize=(10, 6))
sns.heatmap(data=pivot_table, annot=True, cmap='YlGnBu', fmt='.2f', linewidths=0.5)
plt.title('Average Total Sales by Product and Region')
plt.xlabel('Product')
plt.ylabel('Region')
plt.show()
# In[ ]: