In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the three CSV inputs, relative to the notebook's working directory.
file_paths = [
    '../data/umsatzdaten_gekuerzt.csv',
    '../data/wetter.csv',
    '../data/kiwo.csv',
]

# Load each file with pandas' default CSV settings; the resulting list keeps
# the same order as file_paths.
dataframes = list(map(pd.read_csv, file_paths))

# Positional unpacking gives each frame a name matching its source file.
df_umsatz, df_wetter, df_kiwo = dataframes

In [None]:
# NOTE(review): `df` is not created by any cell visible above (only df_umsatz,
# df_wetter and df_kiwo are) -- confirm which cell builds it, otherwise this
# cell fails after "Restart Kernel -> Run All".

# Preview the first and last rows. Jupyter only renders a bare expression when
# it is the last statement of a cell, so the previews are shown explicitly via
# display() (available in IPython/Jupyter kernels without an import).
display(df.head())
display(df.tail())

# Convert 'Datum' to datetime format
df['Datum'] = pd.to_datetime(df['Datum'])

# Summary statistics as produced by DataFrame.describe() with default arguments
descriptive_stats = df.describe()

print(descriptive_stats)


In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Draw one violin plot per numeric column of `df`, side by side in one figure.
numeric_columns = df.select_dtypes(include=[np.number]).columns
num_columns = len(numeric_columns)

if num_columns == 0:
    # plt.subplots(ncols=0) raises, so there is nothing sensible to draw.
    print("No numeric columns to plot.")
else:
    # squeeze=False always returns a 2-D array of Axes; without it a single
    # numeric column yields a bare Axes object that cannot be zipped over.
    fig, axes = plt.subplots(
        nrows=1,
        ncols=num_columns,
        figsize=(12, 6),
        sharey=False,  # each column keeps its own y-scale
        squeeze=False,
    )
    axes = axes.ravel()  # flatten to 1-D so it pairs up with numeric_columns

    # Create a violin plot for each numeric column
    for ax, column in zip(axes, numeric_columns):
        sns.violinplot(data=df[column], ax=ax, inner='quartile')
        ax.set_title(column)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()  # Adjust layout to prevent overlap
    plt.show()


In [None]:
import numpy as np

# Report how many missing values each column of `df` contains.
nan_counts = df.isna().sum()
for column, nan_count in nan_counts.items():
    print(f"Column '{column}' has {nan_count} NaN values.")

# Report how many entries of 'Umsatz' are +/- infinity.
inf_count = np.isinf(df['Umsatz']).sum()
print("Number of infinite values in 'Umsatz':", inf_count)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Works on the `df` left in the kernel by earlier cells; it must have an
# 'Umsatz' column.

# Report how many 'Umsatz' entries are missing or infinite before cleaning.
n_nan = df['Umsatz'].isna().sum()
n_inf = np.isinf(df['Umsatz']).sum()
print("Number of NaN values in 'Umsatz':", n_nan)
print("Number of infinite values in 'Umsatz':", n_inf)

# Rebind `df` to only the rows whose 'Umsatz' is neither NaN nor infinite.
df = df[df['Umsatz'].notna()]
df = df[~np.isinf(df['Umsatz'])]

# Histogram of the cleaned 'Umsatz' values, drawn on an explicit Axes.
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
hist_ax.hist(df['Umsatz'], bins=30, color='blue', edgecolor='black', linewidth=0.2)
hist_ax.set_title('Histogram of Umsatz')
hist_ax.set_xlabel('Umsatz')
hist_ax.set_ylabel('Frequency')
hist_ax.grid()
hist_fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Normality check for every numeric column of `df`: a Shapiro-Wilk test plus a
# histogram and a normal Q-Q plot.

# Significance level used to decide whether H0 (normality) is rejected.
ALPHA = 0.05
# scipy documents the Shapiro-Wilk p-value as potentially inaccurate above
# this sample size, so larger columns are tested on a random subsample.
SHAPIRO_MAX_N = 5000
# Fixed seed so the subsample, and therefore the reported p-value, is reproducible.
SHAPIRO_SEED = 42

# Loop through each numeric column
for column in df.select_dtypes(include=[np.number]).columns:
    # Drop NaN once and reuse the result for the test and both plots.
    values = df[column].dropna()

    # stats.shapiro raises on fewer than 3 observations (e.g. an all-NaN column).
    if len(values) < 3:
        print(f"Column: {column}, skipped: Shapiro-Wilk needs at least 3 non-NaN values (got {len(values)}).")
        continue

    if len(values) > SHAPIRO_MAX_N:
        test_values = values.sample(n=SHAPIRO_MAX_N, random_state=SHAPIRO_SEED)
        print(f"Column: {column}, Shapiro-Wilk run on a random subsample of {SHAPIRO_MAX_N} out of {len(values)} values.")
    else:
        test_values = values

    # Perform the Shapiro-Wilk test
    stat, p_value = stats.shapiro(test_values)
    print(f"Column: {column}, Shapiro-Wilk Test Statistic: {stat}, p-value: {p_value}")

    # H0 is "the data are normally distributed": a p-value above ALPHA means
    # H0 cannot be rejected, otherwise it is rejected.
    if p_value > ALPHA:
        print(f"The distribution of {column} appears to be normal (fail to reject H0).")
    else:
        print(f"The distribution of {column} does not appear to be normal (reject H0).")

    # Both plots use all non-NaN values, not the test subsample.
    norm_fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Plot histogram
    hist_ax.hist(values, bins=30, color='blue', edgecolor='black', linewidth=0.2)
    hist_ax.set_title(f'Histogram of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    # Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q Plot of {column}')

    norm_fig.tight_layout()
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(norm_fig)
