In [11]:
import pandas as pd

social_media_df = pd.read_csv('social_media_interactions_cleaned.csv')
transactions_df = pd.read_csv('customer_transactions_cleaned.csv')
demographics_df = pd.read_csv('customer_demographics_cleaned.csv')

# Function to calculate and display descriptive statistics
def calculate_descriptive_statistics(df, df_name):
    print(f"\nDescriptive Statistics for {df_name}:")
    descriptive_stats = df.describe(include='all')
    print(descriptive_stats)
    return descriptive_stats

# Calculate descriptive statistics for the transactions DataFrame
transactions_describe = calculate_descriptive_statistics(transactions_df, "Transactions DataFrame")

# Calculate descriptive statistics for the demographics DataFrame
demographics_describe = calculate_descriptive_statistics(demographics_df, "Demographics DataFrame")

# Analyze the 'Amount' column in the Transactions DataFrame
if 'Amount' in transactions_df.columns:
    print("\nDescriptive Statistics for Amount Column:")
    amount_description = transactions_df['Amount'].describe()
    print(amount_description)

    # Interpretation of Amount Column
    mean_amount = amount_description['mean']
    median_amount = amount_description['50%']
    std_dev_amount = amount_description['std']
    print("\nInterpretation of Amount Column:")
    print(f"- Mean Amount: {mean_amount:.2f}")
    print(f"- Median Amount: {median_amount:.2f}")
    print(f"- Standard Deviation of Amount: {std_dev_amount:.2f}")

# Analyze the 'IncomeLevel' column in the Demographics DataFrame
if 'IncomeLevel' in demographics_df.columns:
    print("\nIncome Level Counts:")
    income_counts = demographics_df['IncomeLevel'].value_counts()
    print(income_counts)

    # Interpretation of Income Level
    print("\nInterpretation of Income Level:")
    for level, count in income_counts.items():
        print(f"- {level}: {count} customers")


Descriptive Statistics for Transactions DataFrame:
                                  CustomerID  TransactionID TransactionDate  \
count                                   3015         3015.0            2917   
unique                                  1871            NaN             716   
top     e54d1219-b6b3-4b5c-8b75-62f4148c23b6            NaN      2022-10-31   
freq                                       6            NaN              11   
mean                                     NaN            0.0             NaN   
std                                      NaN            0.0             NaN   
min                                      NaN            0.0             NaN   
25%                                      NaN            0.0             NaN   
50%                                      NaN            0.0             NaN   
75%                                      NaN            0.0             NaN   
max                                      NaN            0.0             NaN   


In [5]:
import pandas as pd

demographics_df = pd.read_csv('customer_demographics_cleaned.csv')

# Function to clean invalid age entries
def clean_age(df):
    print("Age column before cleaning:")
    print(df['Age'].describe())

    # Define age limits
    min_age = 1  # Minimum valid age
    max_age = 120  # Maximum valid age

    # Set invalid ages (negative and zero) to NaN
    df['Age'] = df['Age'].where((df['Age'] >= min_age) & (df['Age'] <= max_age), other=pd.NA)

    print("\nAge column after cleaning:")
    print(df['Age'].describe())

    return df

# Clean the Age column in the demographics DataFrame
demographics_df = clean_age(demographics_df)

# Save the cleaned demographics DataFrame back to CSV
demographics_df.to_csv('customer_demographics_cleaned.csv', index=False)

Age column before cleaning:
count    3023.000000
mean       40.763811
std        23.223352
min        -1.000000
25%        26.000000
50%        42.000000
75%        57.000000
max       150.000000
Name: Age, dtype: float64

Age column after cleaning:
count    2650.000000
mean       44.644906
std        15.405637
min        18.000000
25%        31.000000
50%        45.000000
75%        58.000000
max        70.000000
Name: Age, dtype: float64


In [9]:
import pandas as pd
import numpy as np

# Load your datasets
social_media_df = pd.read_csv('social_media_interactions_cleaned.csv')
transactions_df = pd.read_csv('customer_transactions_cleaned.csv')
demographics_df = pd.read_csv('customer_demographics_cleaned.csv')

# Replace "Unknown" with NaN in the IncomeLevel column
demographics_df['IncomeLevel'].replace('Unknown', np.nan, inplace=True)

# Check if the replacement was successful
print(demographics_df['IncomeLevel'].value_counts(dropna=False))

# Proceed with further analysis or export the cleaned data
demographics_df.to_csv('customer_demographics_cleaned.csv', index=False)

IncomeLevel
High      946
Low       923
Medium    869
NaN       285
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  demographics_df['IncomeLevel'].replace('Unknown', np.nan, inplace=True)


In [13]:
import pandas as pd

# Load the cleaned datasets
social_media_df = pd.read_csv('social_media_interactions_cleaned.csv')
transactions_df = pd.read_csv('customer_transactions_cleaned.csv')
demographics_df = pd.read_csv('customer_demographics_cleaned.csv')

# Get the number of rows and columns for each dataset
social_media_shape = social_media_df.shape
transactions_shape = transactions_df.shape
demographics_shape = demographics_df.shape

print(f'Social Media Interactions: {social_media_shape[0]} rows and {social_media_shape[1]} columns')
print(f'Customer Transactions: {transactions_shape[0]} rows and {transactions_shape[1]} columns')
print(f'Customer Demographics: {demographics_shape[0]} rows and {demographics_shape[1]} columns')

Social Media Interactions: 3020 rows and 6 columns
Customer Transactions: 3015 rows and 6 columns
Customer Demographics: 3023 rows and 6 columns
