# Install Required Libraries

In [1]:
!pip install --upgrade google-cloud-storage
!pip install pandas

Collecting google-cloud-storage
  Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl.metadata (12 kB)
Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl (174 kB)
Installing collected packages: google-cloud-storage
  Attempting uninstall: google-cloud-storage
    Found existing installation: google-cloud-storage 2.19.0
    Uninstalling google-cloud-storage-2.19.0:
      Successfully uninstalled google-cloud-storage-2.19.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-aiplatform 1.86.0 requires google-cloud-storage<3.0.0,>=1.32.0, but you have google-cloud-storage 3.1.0 which is incompatible.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 3.1.0 which is incompatible.[0m[31m
[0mSuccessfully installed google-cloud-storage-3.1.0


# Import Libraries

In [2]:
import pandas as pd
from google.cloud import storage
from io import StringIO

# Set Up GCS Access

In [11]:
BUCKET_NAME = '018320627-bucket'
FILE_NAME = 'Customers.csv'

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(FILE_NAME)

# Read content as string
data = blob.download_as_text()
df = pd.read_csv(StringIO(data), sep=';')
df.head()

# print(data)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website,Age,Annual Income (k$),Spending Score (1-100)
0,1,EB54EF1154C3A78,Heather,Callahan,Mosley-David,Lake Jeffborough,Norway,043-797-5229,915.112.1727,urangel@espinoza-francis.net,2020-08-26,http://www.escobar.org/,19,15,39
1,2,10dAcafEBbA5FcA,Kristina,Ferrell,"Horn, Shepard and Watson",Aaronville,Andorra,932-062-1802,(209)172-7124x3651,xreese@hall-donovan.com,2020-04-27,https://tyler-pugh.info/,21,15,81
2,3,67DAB15Ebe4BE4a,Briana,Andersen,Irwin-Oneal,East Jordan,Nepal,8352752061,(567)135-1918,haleybraun@blevins-sexton.com,2022-03-22,https://www.mack-bell.net/,20,16,6
3,4,6d350C5E5eDB4EE,Patty,Ponce,Richardson Group,East Kristintown,Northern Mariana Islands,302.398.3833,196-189-7767x770,hohailey@anthony.com,2020-07-02,https://delacruz-freeman.org/,23,16,77
4,5,5820deAdCF23EFe,Kathleen,Mccormick,Carson-Burch,Andresmouth,Macao,001-184-153-9683x1497,552.051.2979x342,alvaradojesse@rangel-shields.com,2021-01-17,https://welch.info/,31,17,40


# Do Some Basic Analysis

In [18]:
# Basic info
df.info()

# Summary statistics
df.describe()

# Distribution of genders (if exists)
if 'Gender' in df.columns:
    print(df['Gender'].value_counts())

# Average income (if column exists)
if 'Annual Income (k$)' in df.columns:
    print("Average income:", df['Annual Income (k$)'].mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Index                   10000 non-null  int64 
 1   Customer Id             10000 non-null  object
 2   First Name              10000 non-null  object
 3   Last Name               10000 non-null  object
 4   Company                 10000 non-null  object
 5   City                    10000 non-null  object
 6   Country                 10000 non-null  object
 7   Phone 1                 10000 non-null  object
 8   Phone 2                 10000 non-null  object
 9   Email                   10000 non-null  object
 10  Subscription Date       10000 non-null  object
 11  Website                 10000 non-null  object
 12  Age                     10000 non-null  int64 
 13  Annual Income (k$)      10000 non-null  int64 
 14  Spending Score (1-100)  10000 non-null  int64 
dtypes: 

# Answers to questions

In [21]:
print("Average income:", df['Annual Income (k$)'].mean())
print("Average age:", df['Age'].mean())

Average income: 60.56
Average age: 38.85


In [24]:
print(df.columns)
if not "Gender" in df.columns:
    print("'Gender' column does not exists") 

Index(['Index', 'Customer Id', 'First Name', 'Last Name', 'Company', 'City',
       'Country', 'Phone 1', 'Phone 2', 'Email', 'Subscription Date',
       'Website', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)'],
      dtype='object')
'Gender' column does not exists


In [28]:
# Count of missing values per column
missing_values = df.isnull().sum()

# Column with the most missing values
column_with_most_missing = missing_values.idxmax()
max_missing_count = missing_values.max()

print(missing_values)
print(column_with_most_missing)
print(max_missing_count)

Index                     0
Customer Id               0
First Name                0
Last Name                 0
Company                   0
City                      0
Country                   0
Phone 1                   0
Phone 2                   0
Email                     0
Subscription Date         0
Website                   0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64
Index
0
