<a href="https://colab.research.google.com/github/amirmohammadkalateh/analyze/blob/main/1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Build a reproducible synthetic employee dataset.
# The global NumPy seed is fixed so every run draws the same values.
np.random.seed(42)
n_samples = 1000

# (column name, mean, standard deviation) of each normally distributed
# feature. Order matters: the columns are drawn one after another from the
# same global random stream, so reordering would change every value.
_normal_features = (
    ('Age', 35, 10),
    ('Income', 50000, 15000),
    ('Years_Experience', 10, 5),
    ('Performance_Score', 7.5, 1.5),
)

data = {
    column: np.random.normal(center, spread, n_samples)
    for column, center, spread in _normal_features
}
# The categorical column is drawn last, after all numeric columns.
data['Department'] = np.random.choice(
    ['HR', 'IT', 'Sales', 'Marketing'], n_samples
)

df = pd.DataFrame(data)

# 1. Basic Statistical Analysis
# Skewness, kurtosis and correlation are only defined for numbers, so they
# all operate on the same numeric-only view of the frame.
numeric_view = df.select_dtypes(include=[np.number])

print("\n1. Basic Statistical Analysis:")
print("\nDescriptive Statistics:")
print(df.describe())

# Each entry pairs a printed heading with the summary that follows it.
for heading, summarize in (
    ("\nSkewness:", numeric_view.skew),
    ("\nKurtosis:", numeric_view.kurtosis),
    ("\nCorrelation Matrix:", numeric_view.corr),
):
    print(heading)
    print(summarize())

# 2. Group Statistics
# Aggregate numeric columns only. mean/std are undefined for text columns and
# pandas 2.x raises a TypeError for them instead of silently dropping them,
# so an extra non-numeric column would otherwise break this section.
group_value_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'Department'  # the grouping key is never an aggregated value
]
by_department = df.groupby('Department')[group_value_cols]

print("\n2. Group Statistics:")
print("\nMean by Department:")
print(by_department.mean())

print("\nAggregate Statistics by Department:")
print(by_department.agg(['mean', 'std', 'min', 'max']))

# Visualizations
# 1. Histogram
# One 12x6 figure with two side-by-side panels, written to histogram.png.
hist_fig, (age_ax, income_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: plain histogram of Age in 30 bins.
age_ax.hist(df['Age'], bins=30)
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')

# Right panel: Income histogram with the departments stacked on each other.
sns.histplot(
    data=df,
    x='Income',
    hue='Department',
    multiple="stack",
    ax=income_ax,
)
income_ax.set_title('Income Distribution by Department')

hist_fig.savefig('histogram.png')
plt.close(hist_fig)  # release the figure once it is on disk

# 2. Box Plot
# Performance score spread per department, written to boxplot.png.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Department', y='Performance_Score', ax=box_ax)
box_ax.set_title('Performance Score Distribution by Department')
box_fig.savefig('boxplot.png')
plt.close(box_fig)

# 3. Scatter Plot Matrix
# Pairwise scatter plots of every column except the categorical Department.
pairplot_frame = df.drop(columns='Department')
sns.pairplot(pairplot_frame)
# No figure is created explicitly here; the pyplot calls below act on the
# current figure, which is the one pairplot just drew.
plt.savefig('pairplot.png')
plt.close()

# 4. Correlation Heatmap
# Annotated correlation matrix of the numeric columns, written to heatmap.png.
numerical_cols = ['Age', 'Income', 'Years_Experience', 'Performance_Score']
correlations = df[numerical_cols].corr()

heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Correlation Heatmap')
heat_fig.savefig('heatmap.png')
plt.close(heat_fig)

# 5. 3D Scatter Plot using Plotly
# Interactive Age / Income / Performance_Score scatter, coloured by
# department and exported as a standalone HTML file.
scatter_axes = {'x': 'Age', 'y': 'Income', 'z': 'Performance_Score'}
fig = px.scatter_3d(
    df,
    color='Department',
    title='3D Scatter Plot',
    **scatter_axes,
)
fig.write_html('3d_scatter.html')

# 6. Violin Plot
# Income distribution shape per department, written to violin.png.
violin_fig, violin_ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, x='Department', y='Income', ax=violin_ax)
violin_ax.set_title('Income Distribution by Department (Violin Plot)')
violin_fig.savefig('violin.png')
plt.close(violin_fig)

# Advanced Statistical Analysis
# 1. Z-score analysis
# Standardise each numeric column and summarise the result.
scaler = StandardScaler()
numerical_cols = ['Age', 'Income', 'Years_Experience', 'Performance_Score']
# The scaler output is wrapped back into a DataFrame. Passing df's index keeps
# every z-score row aligned with its source row even if df is later filtered
# or re-indexed; the default would be a fresh 0..n-1 index.
z_scores = pd.DataFrame(
    scaler.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)
print("\nZ-scores Summary:")
print(z_scores.describe())

# 2. Quartile Analysis
# For each numeric column report the interquartile range and the bounds
# lying 1.5 * IQR below the first and above the third quartile.
print("\nQuartile Analysis:")
for col in numerical_cols:
    column_values = df[col]
    lower_quartile = column_values.quantile(0.25)
    upper_quartile = column_values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    margin = 1.5 * spread
    print(
        f"\n{col}:",
        f"IQR: {spread}",
        f"Lower bound: {lower_quartile - margin}",
        f"Upper bound: {upper_quartile + margin}",
        sep="\n",
    )

# 3. Distribution Tests
# Run scipy's normaltest on every numeric column and report its p-value.
# A small p-value is evidence against the column being normally distributed.
# `stats` is scipy.stats, imported with the other dependencies at the top of
# the file.
for col in numerical_cols:
    normality = stats.normaltest(df[col])
    print(f"\nNormality test for {col}:")
    print(f"p-value: {normality.pvalue}")

print("\nAnalysis completed! Check the generated visualization files.")



1. Basic Statistical Analysis:

Descriptive Statistics:
               Age        Income  Years_Experience  Performance_Score
count  1000.000000   1000.000000       1000.000000        1000.000000
mean     35.193321  51062.543559         10.029171           7.471921
std       9.792159  14961.815658          4.917271           1.540699
min       2.587327   5894.170480         -5.097561           3.105827
25%      28.524097  40906.374665          6.760002           6.393869
50%      35.253006  50946.156985          9.998746           7.500277
75%      41.479439  60933.232655         13.304577           8.500418
max      73.527315  97896.613518         29.631189          12.364639

Skewness:
Age                  0.116976
Income              -0.049396
Years_Experience     0.061247
Performance_Score   -0.002121
dtype: float64

Kurtosis:
Age                  0.072562
Income               0.058403
Years_Experience     0.172201
Performance_Score   -0.229274
dtype: float64

Correlation Matrix:
