In [1]:
#Authors: Velvet Robinson, Samin Nikkhoo, Aspen Jack, Christina Zermeno, Arpita Sharma
#Project: Ethical Fashion, creating data visualizations using Fashion Revolution
#Link: https://www.fashionrevolution.org/ 
#Data downloaded from: https://wikirate.org/Fashion_Revolution+Fashion_Transparency_Index_2021
# Data we are using includes the company list, headquarters,and metrics for Decarbonization and Deforestation, 
#Towards Paying Living Wages, Overconsumption Waste Circulatory Answer, Sustainable Sourcing.

In [2]:
#importing the dependencies we will be using here
import pandas as pd
import csv 
from sqlalchemy import create_engine
import os
import numpy as np

In [3]:
#Files we will be using are here: 

company_headquarters = "Resources/Company_Headquarters.csv"
decarbonization_score = "Resources/Metrics/Decarbonisation_Deforestation_and_Regeneration_Scores.csv"
overconsumption_score = "Resources/Metrics/Overconsumption_Waste_Circularity_Scores.csv"
sustainable_materials_score = "Resources/Metrics/Sustainable_Sourcing_Materials_Scores.csv"
living_wage_score = "Resources/Metrics/Towards_Paying_Living_Wages_Scores.csv"

#now reading them in as pd dataframes
company_df = pd.read_csv(company_headquarters, header=4)
decarbonization_df = pd.read_csv(decarbonization_score, header=4)
overconsumption_df = pd.read_csv(overconsumption_score,header=4)
sustainable_materials_df = pd.read_csv(sustainable_materials_score,header=4)
living_wage_df = pd.read_csv(living_wage_score,header=4)


In [4]:
#checking all the tables are showing up correctly
#company_df
#decarbonization_df
#overconsumption_df
#sustainable_materials_df
#living_wage_df

In [5]:
#adding metric files together
concatenated_df = pd.concat([decarbonization_df, overconsumption_df, sustainable_materials_df, living_wage_df], ignore_index=True)

In [6]:
#checking merged dataset
concatenated_df
#contains 1202 rows, which combines all rows from csv files

Unnamed: 0,Answer Page,Metric,Company,Year,Value,Source Page
0,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Gucci,2021,10.000000,
1,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Saint Laurent,2021,10.000000,
2,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Bottega Veneta,2021,10.000000,
3,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Balenciaga,2021,10.000000,
4,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",H&M,2021,8.571429,
...,...,...,...,...,...,...
1197,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,T.T. Blues Jeans,2021,0.000000,
1198,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Verochi SA de CV,2021,0.000000,
1199,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Yale de Mexico SA de CV,2021,0.000000,
1200,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Ilusion (Diltex SA de CV),2021,0.000000,


In [7]:
#Splitting metric to take out Fashion Revolution from text
concatenated_df[["Metric1","Metric2"]] = concatenated_df["Metric"].str.split('+',expand=True)
concatenated_df[["Metric3","Metric4"]] = concatenated_df["Metric2"].str.split('(',expand=True)


In [8]:
#Reviewing cleaned_df here:
concatenated_df

#Looks good, we will now drop metric and metric1, metric2, metrict4

Unnamed: 0,Answer Page,Metric,Company,Year,Value,Source Page,Metric1,Metric2,Metric3,Metric4
0,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Gucci,2021,10.000000,,Fashion Revolution,"Decarbonisation, Deforestation and Regeneratio...","Decarbonisation, Deforestation and Regeneratio...",
1,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Saint Laurent,2021,10.000000,,Fashion Revolution,"Decarbonisation, Deforestation and Regeneratio...","Decarbonisation, Deforestation and Regeneratio...",
2,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Bottega Veneta,2021,10.000000,,Fashion Revolution,"Decarbonisation, Deforestation and Regeneratio...","Decarbonisation, Deforestation and Regeneratio...",
3,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",Balenciaga,2021,10.000000,,Fashion Revolution,"Decarbonisation, Deforestation and Regeneratio...","Decarbonisation, Deforestation and Regeneratio...",
4,https://wikirate.org/Fashion_Revolution+Decarb...,"Fashion Revolution+Decarbonisation, Deforestat...",H&M,2021,8.571429,,Fashion Revolution,"Decarbonisation, Deforestation and Regeneratio...","Decarbonisation, Deforestation and Regeneratio...",
...,...,...,...,...,...,...,...,...,...,...
1197,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,T.T. Blues Jeans,2021,0.000000,,Fashion Revolution,Towards Paying Living Wages Score (2021),Towards Paying Living Wages Score,2021)
1198,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Verochi SA de CV,2021,0.000000,,Fashion Revolution,Towards Paying Living Wages Score (2021),Towards Paying Living Wages Score,2021)
1199,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Yale de Mexico SA de CV,2021,0.000000,,Fashion Revolution,Towards Paying Living Wages Score (2021),Towards Paying Living Wages Score,2021)
1200,https://wikirate.org/Fashion_Revolution+Toward...,Fashion Revolution+Towards Paying Living Wages...,Ilusion (Diltex SA de CV),2021,0.000000,,Fashion Revolution,Towards Paying Living Wages Score (2021),Towards Paying Living Wages Score,2021)


In [9]:
#Dropping unnecessary variables here: 
dropped_df = concatenated_df.drop(columns =["Answer Page","Source Page", "Metric","Metric1","Metric2","Metric4"])
dropped_df

Unnamed: 0,Company,Year,Value,Metric3
0,Gucci,2021,10.000000,"Decarbonisation, Deforestation and Regeneratio..."
1,Saint Laurent,2021,10.000000,"Decarbonisation, Deforestation and Regeneratio..."
2,Bottega Veneta,2021,10.000000,"Decarbonisation, Deforestation and Regeneratio..."
3,Balenciaga,2021,10.000000,"Decarbonisation, Deforestation and Regeneratio..."
4,H&M,2021,8.571429,"Decarbonisation, Deforestation and Regeneratio..."
...,...,...,...,...
1197,T.T. Blues Jeans,2021,0.000000,Towards Paying Living Wages Score
1198,Verochi SA de CV,2021,0.000000,Towards Paying Living Wages Score
1199,Yale de Mexico SA de CV,2021,0.000000,Towards Paying Living Wages Score
1200,Ilusion (Diltex SA de CV),2021,0.000000,Towards Paying Living Wages Score


In [10]:
#Renaming Metric3 to Metric
cleaned_df = dropped_df.rename(columns={"Metric3":"Metric"})
cleaned_df
company_df

Unnamed: 0,Link,Name,ID,Headquarters
0,https://wikirate.org/~60914,Abercrombie & Fitch,60914,New York (United States)
1,https://wikirate.org/~7217,Adidas AG,7217,Germany
2,https://wikirate.org/~2612145,Aeropostale Inc.,2612145,New York (United States)
3,https://wikirate.org/~1830731,Airwair International Ltd (Dr Martens),1830731,United Kingdom
4,https://wikirate.org/~48256,Aldi Nord,48256,Germany
...,...,...,...,...
244,https://wikirate.org/~5407364,Wrangler,5407364,North Carolina (United States)
245,https://wikirate.org/~5785348,Youngor,5785348,Zhejiang (China)
246,https://wikirate.org/~2608717,Zalando SE,2608717,Germany
247,https://wikirate.org/~18215,Zara,18215,Spain


In [11]:
#Now checking if we need to clean headquarters
company_df["Headquarters"].unique()
#we do need to clean it, as there are some headquarters with state and country data, while others only have country data. 
#We will need to do this for United States, China, and Canada. 

array(['New York (United States)', 'Germany', 'United Kingdom',
       'Quebec (Canada)', 'Washington (United States)',
       'Pennsylvania (United States)', 'Fujian (China)', 'Canada',
       'Japan', 'Spain', 'Switzerland', 'California (United States)',
       'Korea, Republic of', 'Guangdong (China)', 'India', 'Australia',
       'Shanghai (China)', 'Italy', 'Nebraska (United States)',
       'New Jersey (United States)', 'Belgium', 'Ontario, Canada',
       'Michigan (United States)', 'France',
       'North Carolina (United States)', 'Florida (United States)',
       'Illinois (United States)', 'New Hampshire (United States)',
       'Oregon (United States)', 'Massachusetts (United States)',
       'Arkansas (United States)', 'Norway', 'Ohio (United States)',
       'Hong Kong', 'Chile', 'Missouri (United States)', 'Sweden',
       'South Africa', 'Texas (United States)', 'Netherlands',
       'Jiangsu (China)', 'Denmark', 'Wisconsin (United States)',
       'New Zealand', 'Beiji

In [12]:
#Creating separate country datasets here for the three countries
US_df = company_df[company_df["Headquarters"].str.contains("United States")]
CHINA_df = company_df[company_df["Headquarters"].str.contains("China")]
CAD_df = company_df[company_df["Headquarters"].str.contains("Canada")]


#Running checks to see if the datasets look okay
US_df
CHINA_df
CAD_df

#now creating a dataset for all the outher countries which currently have no issues
no_state_df = company_df[(~company_df["Headquarters"].str.contains("Canada"))& \
                         (~company_df["Headquarters"].str.contains("United States"))& \
                         (~company_df["Headquarters"].str.contains("China"))]
no_state_df


Unnamed: 0,Link,Name,ID,Headquarters
1,https://wikirate.org/~7217,Adidas AG,7217,Germany
3,https://wikirate.org/~1830731,Airwair International Ltd (Dr Martens),1830731,United Kingdom
4,https://wikirate.org/~48256,Aldi Nord,48256,Germany
5,https://wikirate.org/~5208,Aldi Sud,5208,Germany
12,https://wikirate.org/~170564,Asics Corporation,170564,Japan
...,...,...,...,...
242,https://wikirate.org/~130172,Warehouse Group,130172,New Zealand
243,https://wikirate.org/~56864,Woolworths Holdings Limited,56864,South Africa
246,https://wikirate.org/~2608717,Zalando SE,2608717,Germany
247,https://wikirate.org/~18215,Zara,18215,Spain


In [13]:
#Creating State and Country columns in the no_state_df so that it's easier to add with the rest of the files later
no_state_df.insert(3,"State","")
column_to_set_null = "State"
no_state_df.loc[no_state_df["State"] == "", column_to_set_null] = np.nan
no_state_df["Country"] = no_state_df["Headquarters"]
no_state_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_state_df["Country"] = no_state_df["Headquarters"]


Unnamed: 0,Link,Name,ID,State,Headquarters,Country
1,https://wikirate.org/~7217,Adidas AG,7217,,Germany,Germany
3,https://wikirate.org/~1830731,Airwair International Ltd (Dr Martens),1830731,,United Kingdom,United Kingdom
4,https://wikirate.org/~48256,Aldi Nord,48256,,Germany,Germany
5,https://wikirate.org/~5208,Aldi Sud,5208,,Germany,Germany
12,https://wikirate.org/~170564,Asics Corporation,170564,,Japan,Japan
...,...,...,...,...,...,...
242,https://wikirate.org/~130172,Warehouse Group,130172,,New Zealand,New Zealand
243,https://wikirate.org/~56864,Woolworths Holdings Limited,56864,,South Africa,South Africa
246,https://wikirate.org/~2608717,Zalando SE,2608717,,Germany,Germany
247,https://wikirate.org/~18215,Zara,18215,,Spain,Spain


In [14]:
#Splitting headquarters so that city and country show up in separate columns

#United States
US_df[["State","Country"]] = US_df["Headquarters"].str.split('(',expand=True)
US_df['Country'] = US_df['Country'].str.rstrip(')')

#China
CHINA_df[["State","Country"]] = company_df["Headquarters"].str.split('(',expand=True)
CHINA_df['Country'] = CHINA_df['Country'].str.rstrip(')')

US_df
CHINA_df

# Remove the closing parentheses from the 'Country' column
#company_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  US_df[["State","Country"]] = US_df["Headquarters"].str.split('(',expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  US_df[["State","Country"]] = US_df["Headquarters"].str.split('(',expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  US_df['Country'] = US_df['Country'].str.rstrip(')

Unnamed: 0,Link,Name,ID,Headquarters,State,Country
10,https://wikirate.org/~5424735,Anta Sports Products,5424735,Fujian (China),Fujian,China
19,https://wikirate.org/~49737,Belle International Holdings,49737,Guangdong (China),Guangdong,China
27,https://wikirate.org/~5294189,Bosideng International Holdings Limited,5294189,Shanghai (China),Shanghai,China
103,https://wikirate.org/~2631045,Heilan Home,2631045,Jiangsu (China),Jiangsu,China
136,https://wikirate.org/~1173485,Li-Ning,1173485,Beijing (China),Beijing,China
154,https://wikirate.org/~3624503,Metersbonwe,3624503,Shanghai (China),Shanghai,China
245,https://wikirate.org/~5785348,Youngor,5785348,Zhejiang (China),Zhejiang,China


In [15]:
#Canada
#CAD_df[["State","Country"]] = CAD_df["Headquarters"].str.split('(',expand=True)


# Split on commas and parentheses, and expand into separate columns
split_columns = CAD_df['Headquarters'].str.split(r'[,(]+', expand=True)


# Rename the columns to 'State' and 'Country'
split_columns.columns = ['State', 'Country']

# Assign the new columns to the original DataFrame
CAD_df[['State', 'Country']] = split_columns
CAD_df['Country'] = CAD_df['Country'].str.rstrip(')')

CAD_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CAD_df[['State', 'Country']] = split_columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CAD_df[['State', 'Country']] = split_columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CAD_df['Country'] = CAD_df['Country'].str.rstrip(')')


Unnamed: 0,Link,Name,ID,Headquarters,State,Country
6,https://wikirate.org/~5793646,ALDO,5793646,Quebec (Canada),Quebec,Canada
11,https://wikirate.org/~5414341,Aritzia,5414341,Canada,Canada,
39,https://wikirate.org/~5408890,Canada Goose,5408890,"Ontario, Canada",Ontario,Canada
98,https://wikirate.org/~116138,Gildan Activewear Inc.,116138,Canada,Canada,
108,https://wikirate.org/~3098198,Hudson's Bay Company,3098198,"Ontario, Canada",Ontario,Canada
119,https://wikirate.org/~44824,Joe Fresh,44824,"Ontario, Canada",Ontario,Canada
143,https://wikirate.org/~1825510,lululemon athletica,1825510,British Columbia (Canada),British Columbia,Canada


In [16]:
column_to_set_null = "State"
CAD_df.loc[CAD_df["State"] == "Canada", column_to_set_null] = np.nan
CAD_df["Country"] ="Canada"

CAD_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CAD_df["Country"] ="Canada"


Unnamed: 0,Link,Name,ID,Headquarters,State,Country
6,https://wikirate.org/~5793646,ALDO,5793646,Quebec (Canada),Quebec,Canada
11,https://wikirate.org/~5414341,Aritzia,5414341,Canada,,Canada
39,https://wikirate.org/~5408890,Canada Goose,5408890,"Ontario, Canada",Ontario,Canada
98,https://wikirate.org/~116138,Gildan Activewear Inc.,116138,Canada,,Canada
108,https://wikirate.org/~3098198,Hudson's Bay Company,3098198,"Ontario, Canada",Ontario,Canada
119,https://wikirate.org/~44824,Joe Fresh,44824,"Ontario, Canada",Ontario,Canada
143,https://wikirate.org/~1825510,lululemon athletica,1825510,British Columbia (Canada),British Columbia,Canada


In [17]:
country_concat_df = pd.concat([US_df, CHINA_df, CAD_df, no_state_df], ignore_index=True)
country_concat_df

Unnamed: 0,Link,Name,ID,Headquarters,State,Country
0,https://wikirate.org/~60914,Abercrombie & Fitch,60914,New York (United States),New York,United States
1,https://wikirate.org/~2612145,Aeropostale Inc.,2612145,New York (United States),New York,United States
2,https://wikirate.org/~11708,"Amazon.com, Inc.",11708,Washington (United States),Washington,United States
3,https://wikirate.org/~862441,American Eagle Outfitters,862441,Pennsylvania (United States),Pennsylvania,United States
4,https://wikirate.org/~5455028,Anthropologie,5455028,Pennsylvania (United States),Pennsylvania,United States
...,...,...,...,...,...,...
244,https://wikirate.org/~130172,Warehouse Group,130172,New Zealand,,New Zealand
245,https://wikirate.org/~56864,Woolworths Holdings Limited,56864,South Africa,,South Africa
246,https://wikirate.org/~2608717,Zalando SE,2608717,Germany,,Germany
247,https://wikirate.org/~18215,Zara,18215,Spain,,Spain


In [18]:
#Drop variables for company dataset here:
company_drop_df = country_concat_df.drop(columns=["Link", "Headquarters"])
company_drop_df

Unnamed: 0,Name,ID,State,Country
0,Abercrombie & Fitch,60914,New York,United States
1,Aeropostale Inc.,2612145,New York,United States
2,"Amazon.com, Inc.",11708,Washington,United States
3,American Eagle Outfitters,862441,Pennsylvania,United States
4,Anthropologie,5455028,Pennsylvania,United States
...,...,...,...,...
244,Warehouse Group,130172,,New Zealand
245,Woolworths Holdings Limited,56864,,South Africa
246,Zalando SE,2608717,,Germany
247,Zara,18215,,Spain


In [19]:
#rename name to Company
company_cleaned_df = company_drop_df.rename(columns={"Name":"Company"})

company_cleaned_df

Unnamed: 0,Company,ID,State,Country
0,Abercrombie & Fitch,60914,New York,United States
1,Aeropostale Inc.,2612145,New York,United States
2,"Amazon.com, Inc.",11708,Washington,United States
3,American Eagle Outfitters,862441,Pennsylvania,United States
4,Anthropologie,5455028,Pennsylvania,United States
...,...,...,...,...
244,Warehouse Group,130172,,New Zealand
245,Woolworths Holdings Limited,56864,,South Africa
246,Zalando SE,2608717,,Germany
247,Zara,18215,,Spain


In [20]:
combined_df = pd.merge(company_cleaned_df,cleaned_df, on="Company", how='inner')
combined_df

Unnamed: 0,Company,ID,State,Country,Year,Value,Metric
0,Abercrombie & Fitch,60914,New York,United States,2021,0.000000,"Decarbonisation, Deforestation and Regeneratio..."
1,Abercrombie & Fitch,60914,New York,United States,2021,1.818182,"5.4 Overconsumption, Waste & Circularity"
2,Abercrombie & Fitch,60914,New York,United States,2021,4.444444,5.3 Sustainable Sourcing & Materials
3,Abercrombie & Fitch,60914,New York,United States,2021,1.111111,Towards Paying Living Wages Score
4,Aeropostale Inc.,2612145,New York,United States,2021,0.000000,"Decarbonisation, Deforestation and Regeneratio..."
...,...,...,...,...,...,...,...
947,Zara,18215,,Spain,2021,1.111111,Towards Paying Living Wages Score
948,Zeeman,3096058,,Netherlands,2021,1.428571,"Decarbonisation, Deforestation and Regeneratio..."
949,Zeeman,3096058,,Netherlands,2021,0.000000,"5.4 Overconsumption, Waste & Circularity"
950,Zeeman,3096058,,Netherlands,2021,4.444444,5.3 Sustainable Sourcing & Materials


In [None]:
#checking on whether the number of rows match in the final dataset
len(cleaned_df)
cleaned_df["Company"].nunique()
combined_df["Company"].nunique()
company_cleaned_df["Company"].nunique()
list(company_cleaned_df["Company"])

In [None]:
list(cleaned_df["Company"].unique())

In [None]:
cleaned_df[cleaned_df['Company'].str.contains('"')]


In [None]:
list(cleaned_df["Company"].unique())

In [None]:
list(company_cleaned_df["Company"].unique())

In [None]:
company_cleaned_df[company_cleaned_df["Company"]=="Ontario"]

In [None]:
company_df[company_df["Name"]=="Ontario"]