In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class DataProfiler:
    """Profile a pandas DataFrame and print a sectioned text report.

    Every analysis method prints its section to stdout and records its result
    in ``self.profile_results`` under a fixed key, so the methods can be run
    individually or all together through :meth:`generate_full_report`.
    """

    def __init__(self, dataframe, dataset_name="Dataset"):
        """Keep a copy of *dataframe* and start with an empty result store.

        Args:
            dataframe: The pandas DataFrame to profile. It is copied, so the
                profiler never touches the caller's object.
            dataset_name: Label printed in the report header.
        """
        self.df = dataframe.copy()
        self.dataset_name = dataset_name
        self.profile_results = {}

    @staticmethod
    def _percentage(part, whole, empty=0.0):
        """Return ``part / whole * 100``, or *empty* when *whole* is zero.

        Centralising the guard keeps zero-row / zero-column frames from
        producing NaN percentages or a ZeroDivisionError.
        """
        if whole == 0:
            return empty
        return (part / whole) * 100

    def basic_info(self):
        """Print the report header and the dataset's size/duplicate figures.

        Returns:
            dict with keys ``total_rows``, ``total_columns``,
            ``memory_usage_mb``, ``duplicate_rows`` and
            ``duplicate_percentage``; also stored under
            ``profile_results['basic_info']``.
        """
        print("=" * 80)
        print(f"DATA PROFILING REPORT: {self.dataset_name}")
        print("=" * 80)
        print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print()

        row_count = len(self.df)
        # duplicated() scans the whole frame, so compute it once and reuse it.
        duplicate_count = self.df.duplicated().sum()

        basic_stats = {
            'total_rows': row_count,
            'total_columns': len(self.df.columns),
            'memory_usage_mb': self.df.memory_usage(deep=True).sum() / 1024**2,
            'duplicate_rows': duplicate_count,
            'duplicate_percentage': self._percentage(duplicate_count, row_count),
        }

        print("BASIC INFORMATION")
        print("-" * 40)
        print(f"Total Rows: {basic_stats['total_rows']:,}")
        print(f"Total Columns: {basic_stats['total_columns']:,}")
        print(f"Memory Usage: {basic_stats['memory_usage_mb']:.2f} MB")
        print(f"Duplicate Rows: {basic_stats['duplicate_rows']:,} ({basic_stats['duplicate_percentage']:.2f}%)")
        print()

        self.profile_results['basic_info'] = basic_stats
        return basic_stats

    def data_types_analysis(self):
        """Print one row per column with dtype, null and uniqueness figures.

        Returns:
            pandas DataFrame with columns ``column``, ``dtype``,
            ``non_null_count``, ``null_count``, ``null_percentage``,
            ``unique_values`` and ``unique_percentage``; also stored under
            ``profile_results['data_types']``.
        """
        print("DATA TYPES ANALYSIS")
        print("-" * 40)

        row_count = len(self.df)
        dtype_info = []
        for col in self.df.columns:
            series = self.df[col]
            null_count = series.isnull().sum()
            unique_count = series.nunique()
            dtype_info.append({
                'column': col,
                'dtype': str(series.dtype),
                'non_null_count': series.count(),
                'null_count': null_count,
                'null_percentage': self._percentage(null_count, row_count),
                'unique_values': unique_count,
                'unique_percentage': self._percentage(unique_count, row_count),
            })

        dtype_df = pd.DataFrame(dtype_info)
        print(dtype_df.to_string(index=False))
        print()

        self.profile_results['data_types'] = dtype_df
        return dtype_df

    def missing_values_analysis(self):
        """Print the columns that contain nulls, most affected first.

        Columns with more than 50% missing values are listed separately.

        Returns:
            list of dicts (``column``, ``missing_count``,
            ``missing_percentage``, ``data_type``), one per column that has at
            least one null, in column order; also stored under
            ``profile_results['missing_values']``.
        """
        print("MISSING VALUES ANALYSIS")
        print("-" * 40)

        row_count = len(self.df)
        missing_info = []
        for col in self.df.columns:
            missing_count = self.df[col].isnull().sum()
            if missing_count > 0:
                missing_info.append({
                    'column': col,
                    'missing_count': missing_count,
                    'missing_percentage': self._percentage(missing_count, row_count),
                    'data_type': str(self.df[col].dtype),
                })

        if missing_info:
            missing_df = pd.DataFrame(missing_info)
            missing_df = missing_df.sort_values('missing_percentage', ascending=False)
            print(missing_df.to_string(index=False))

            print("\nMISSING VALUE PATTERNS")
            print("-" * 30)
            high_missing = missing_df[missing_df['missing_percentage'] > 50]
            if not high_missing.empty:
                print("Columns with >50% missing values:")
                for _, row in high_missing.iterrows():
                    print(f"• {row['column']}: {row['missing_percentage']:.1f}% missing")
            else:
                print("No columns have >50% missing values.")
        else:
            print("No missing values found in the dataset.")

        print()
        self.profile_results['missing_values'] = missing_info
        return missing_info

    def statistical_summary(self):
        """Print descriptive and shape statistics for the numeric columns.

        Returns:
            dict with ``descriptive`` (the ``describe()`` frame) and
            ``additional`` (skewness, kurtosis, variance and coefficient of
            variation per column, or None when every numeric column is
            all-null); also stored under
            ``profile_results['statistical_summary']``. Returns None, and
            stores nothing, when the frame has no numeric columns.
        """
        print("STATISTICAL SUMMARY")
        print("-" * 40)

        numerical_cols = self.df.select_dtypes(include=[np.number]).columns
        summary = None

        if len(numerical_cols) > 0:
            stats_summary = self.df[numerical_cols].describe()
            print("DESCRIPTIVE STATISTICS")
            print(stats_summary.round(3))
            print()

            additional_stats = []
            for col in numerical_cols:
                col_data = self.df[col].dropna()
                if len(col_data) > 0:
                    mean_value = col_data.mean()
                    additional_stats.append({
                        'column': col,
                        'skewness': stats.skew(col_data),
                        'kurtosis': stats.kurtosis(col_data),
                        'variance': col_data.var(),
                        # A zero mean makes the ratio undefined; report inf.
                        'coefficient_of_variation': col_data.std() / mean_value if mean_value != 0 else np.inf,
                    })

            additional_df = None
            if additional_stats:
                additional_df = pd.DataFrame(additional_stats)
                print("ADDITIONAL STATISTICS")
                print(additional_df.round(4).to_string(index=False))

            summary = {
                'descriptive': stats_summary,
                'additional': additional_df,
            }
            self.profile_results['statistical_summary'] = summary
        else:
            print("No numerical columns found for statistical analysis.")

        print()
        return summary

    def outlier_detection(self):
        """Print numeric columns that have values outside the 1.5*IQR fences.

        Returns:
            list of dicts (``column``, ``outlier_count``,
            ``outlier_percentage``, ``lower_bound``, ``upper_bound``,
            ``min_outlier``, ``max_outlier``) for columns with at least one
            outlier; also stored under ``profile_results['outliers']``.
            Percentages are relative to the column's non-null values.
        """
        print("OUTLIER DETECTION")
        print("-" * 40)

        numerical_cols = self.df.select_dtypes(include=[np.number]).columns
        outlier_info = []

        for col in numerical_cols:
            col_data = self.df[col].dropna()
            if len(col_data) == 0:
                continue

            q1 = col_data.quantile(0.25)
            q3 = col_data.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
            outlier_count = len(outliers)

            if outlier_count > 0:
                outlier_info.append({
                    'column': col,
                    'outlier_count': outlier_count,
                    'outlier_percentage': (outlier_count / len(col_data)) * 100,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound,
                    'min_outlier': outliers.min(),
                    'max_outlier': outliers.max(),
                })

        if outlier_info:
            outlier_df = pd.DataFrame(outlier_info)
            print(outlier_df.round(3).to_string(index=False))
        else:
            print("No outliers detected using IQR method.")

        print()
        self.profile_results['outliers'] = outlier_info
        return outlier_info

    def correlation_analysis(self):
        """Print numeric column pairs whose correlation magnitude exceeds 0.7.

        Returns:
            dict with ``matrix`` (the ``corr()`` frame) and
            ``high_correlations`` (list of dicts with ``variable_1``,
            ``variable_2``, ``correlation``); also stored under
            ``profile_results['correlation']``. Returns None, and stores
            nothing, when there are fewer than two numeric columns.
        """
        print("CORRELATION ANALYSIS")
        print("-" * 40)

        numerical_cols = self.df.select_dtypes(include=[np.number]).columns
        correlation_result = None

        if len(numerical_cols) > 1:
            correlation_matrix = self.df[numerical_cols].corr()
            labels = correlation_matrix.columns
            label_count = len(labels)

            high_corr_pairs = []
            # Walk the upper triangle only so each pair is reported once.
            for i in range(label_count):
                for j in range(i + 1, label_count):
                    corr_value = correlation_matrix.iloc[i, j]
                    if abs(corr_value) > 0.7:  # High correlation threshold
                        high_corr_pairs.append({
                            'variable_1': labels[i],
                            'variable_2': labels[j],
                            'correlation': corr_value,
                        })

            if high_corr_pairs:
                print("HIGH CORRELATIONS (|r| > 0.7):")
                high_corr_df = pd.DataFrame(high_corr_pairs)
                high_corr_df = high_corr_df.sort_values('correlation', key=abs, ascending=False)
                print(high_corr_df.round(4).to_string(index=False))
            else:
                print("No high correlations (|r| > 0.7) found between numerical variables.")

            correlation_result = {
                'matrix': correlation_matrix,
                'high_correlations': high_corr_pairs,
            }
            self.profile_results['correlation'] = correlation_result
        else:
            print("Insufficient numerical columns for correlation analysis.")

        print()
        return correlation_result

    def categorical_analysis(self):
        """Print frequency figures for the object/category columns.

        Returns:
            pandas DataFrame with one row per categorical column
            (``column``, ``unique_values``, most/least frequent value and
            count, ``most_frequent_percentage``); also stored under
            ``profile_results['categorical']``. Returns None, and stores
            nothing, when the frame has no object/category columns.
        """
        print("CATEGORICAL VARIABLES ANALYSIS")
        print("-" * 40)

        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        cat_df = None

        if len(categorical_cols) > 0:
            cat_info = []
            for col in categorical_cols:
                col_data = self.df[col].dropna()
                value_counts = col_data.value_counts()
                has_values = len(value_counts) > 0

                cat_info.append({
                    'column': col,
                    'unique_values': len(value_counts),
                    'most_frequent_value': value_counts.index[0] if has_values else None,
                    'most_frequent_count': value_counts.iloc[0] if has_values else 0,
                    'most_frequent_percentage': (value_counts.iloc[0] / len(col_data)) * 100 if has_values else 0,
                    'least_frequent_value': value_counts.index[-1] if has_values else None,
                    'least_frequent_count': value_counts.iloc[-1] if has_values else 0,
                })

            cat_df = pd.DataFrame(cat_info)
            print(cat_df.to_string(index=False))

            print("\nTOP 5 VALUES PER CATEGORICAL COLUMN")
            print("-" * 50)
            row_count = len(self.df)
            for col in categorical_cols:
                # Column labels are not guaranteed to be strings (e.g. ints).
                print(f"\n{str(col).upper()}:")
                top_values = self.df[col].value_counts().head()
                for value, count in top_values.items():
                    percentage = self._percentage(count, row_count)
                    print(f"  {value}: {count} ({percentage:.1f}%)")

            self.profile_results['categorical'] = cat_df
        else:
            print("No categorical columns found in the dataset.")

        print()
        return cat_df

    def data_quality_assessment(self):
        """Print completeness, uniqueness, consistency and overall scores.

        * completeness - share of non-null cells;
        * uniqueness - share of rows that are not duplicates;
        * consistency - share of columns that are not object columns mixing
          digit-like and non-digit-like values;
        * overall - plain mean of the three.

        A dimension with nothing to measure (no cells, no rows, no columns)
        scores 100, because nothing violates it.

        Returns:
            dict with keys ``completeness``, ``uniqueness``, ``consistency``
            and ``overall`` (percentages); also stored under
            ``profile_results['quality_scores']``.
        """
        print("DATA QUALITY ASSESSMENT")
        print("-" * 40)

        quality_scores = {}
        row_count = len(self.df)
        column_count = len(self.df.columns)

        total_cells = row_count * column_count
        missing_cells = self.df.isnull().sum().sum()
        completeness_score = self._percentage(total_cells - missing_cells, total_cells, empty=100.0)
        quality_scores['completeness'] = completeness_score

        duplicate_count = self.df.duplicated().sum()
        uniqueness_score = self._percentage(row_count - duplicate_count, row_count, empty=100.0)
        quality_scores['uniqueness'] = uniqueness_score

        consistency_issues = 0
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                non_null_values = self.df[col].dropna()
                if len(non_null_values) > 0:
                    # Heuristic: a value is "numeric" if only digits remain
                    # once '.' and '-' are removed.
                    numeric_count = sum(
                        str(val).replace('.', '').replace('-', '').isdigit()
                        for val in non_null_values
                    )
                    # Flag only columns that mix both kinds of values.
                    if 0 < numeric_count < len(non_null_values):
                        consistency_issues += 1

        consistency_score = self._percentage(column_count - consistency_issues, column_count, empty=100.0)
        quality_scores['consistency'] = consistency_score

        overall_score = (completeness_score + uniqueness_score + consistency_score) / 3
        quality_scores['overall'] = overall_score

        print(f"Completeness Score: {completeness_score:.2f}%")
        print(f"Uniqueness Score: {uniqueness_score:.2f}%")
        print(f"Consistency Score: {consistency_score:.2f}%")
        print(f"Overall Quality Score: {overall_score:.2f}%")

        print("\nQUALITY INTERPRETATION:")
        if overall_score >= 90:
            print("[EXCELLENT] Excellent data quality")
        elif overall_score >= 80:
            print("[GOOD] Good data quality")
        elif overall_score >= 70:
            print("[WARNING] Fair data quality - some improvements needed")
        else:
            print("[WARNING] Poor data quality - significant improvements required")

        print()
        self.profile_results['quality_scores'] = quality_scores
        return quality_scores

    def generate_recommendations(self):
        """Print advice derived from whichever analyses have already run.

        Only sections present in ``profile_results`` are considered, so the
        output depends on which analysis methods were called beforehand.

        Returns:
            list of recommendation strings (possibly empty); also stored
            under ``profile_results['recommendations']``.
        """
        print("RECOMMENDATIONS")
        print("-" * 40)

        recommendations = []

        if 'missing_values' in self.profile_results and self.profile_results['missing_values']:
            high_missing = [item for item in self.profile_results['missing_values']
                            if item['missing_percentage'] > 50]
            if high_missing:
                recommendations.append("Consider removing columns with >50% missing values or investigate data collection issues")

        if 'basic_info' in self.profile_results:
            if self.profile_results['basic_info']['duplicate_percentage'] > 5:
                recommendations.append("High percentage of duplicate rows detected - consider data deduplication")

        if 'outliers' in self.profile_results and self.profile_results['outliers']:
            high_outlier_cols = [item for item in self.profile_results['outliers']
                                 if item['outlier_percentage'] > 10]
            if high_outlier_cols:
                recommendations.append("Several columns have high outlier percentages - investigate data collection or consider outlier treatment")

        if 'correlation' in self.profile_results and self.profile_results['correlation']['high_correlations']:
            recommendations.append("High correlations detected between variables - consider feature selection or dimensionality reduction")

        if 'quality_scores' in self.profile_results:
            if self.profile_results['quality_scores']['overall'] < 80:
                recommendations.append("Overall data quality is below 80% - prioritize data cleaning and validation")

        if recommendations:
            for i, rec in enumerate(recommendations, 1):
                print(f"{i}. {rec}")
        else:
            print("No specific recommendations - your data quality appears to be good!")

        print()
        self.profile_results['recommendations'] = recommendations
        return recommendations

    def generate_full_report(self):
        """Run every analysis section in order and print a closing banner.

        Returns:
            The ``profile_results`` dict populated by the individual sections.
        """
        self.basic_info()
        self.data_types_analysis()
        self.missing_values_analysis()
        self.statistical_summary()
        self.outlier_detection()
        self.correlation_analysis()
        self.categorical_analysis()
        self.data_quality_assessment()
        self.generate_recommendations()

        print("=" * 80)
        print("DATA PROFILING REPORT COMPLETED")
        print("=" * 80)

        return self.profile_results


def profile_dataset(df, dataset_name="Dataset"):
    """Run the full profiling report on *df* and return its results dict.

    Args:
        df: The pandas DataFrame to profile.
        dataset_name: Label shown in the report header.

    Returns:
        Whatever ``DataProfiler.generate_full_report`` returns for *df*.
    """
    return DataProfiler(df, dataset_name).generate_full_report()


def load_and_profile_csv(file_path, dataset_name=None):
    """Load a CSV with pandas and run the full profiling report on it.

    Args:
        file_path: Anything ``pd.read_csv`` accepts; typically a ``str`` or
            ``os.PathLike`` path.
        dataset_name: Label for the report header. When None, it is derived
            from the file name: a trailing ``.csv`` is dropped, underscores
            become spaces and the result is title-cased. Inputs that are not
            path-like fall back to ``"Dataset"``.

    Returns:
        ``(df, results)`` on success, where ``results`` is the value returned
        by ``profile_dataset``. ``(None, None)`` after printing an
        ``[ERROR]`` message when loading or profiling fails.
    """
    # Imported locally because the file's import block does not provide it.
    from pathlib import PureWindowsPath

    try:
        print(f"Loading data from: {file_path}")
        df = pd.read_csv(file_path)
        print(f"[SUCCESS] Successfully loaded {len(df)} rows and {len(df.columns)} columns")
        print()

        if dataset_name is None:
            try:
                # PureWindowsPath splits on both '/' and '\\' regardless of
                # the host OS and accepts str as well as os.PathLike objects.
                base_name = PureWindowsPath(file_path).name
            except TypeError:
                # Not a path at all (e.g. an open file object).
                base_name = ""

            # Strip only a real trailing extension, never a '.csv' that
            # happens to appear in the middle of the name.
            if base_name.lower().endswith('.csv'):
                base_name = base_name[:-len('.csv')]

            dataset_name = base_name.replace('_', ' ').title() or "Dataset"

        results = profile_dataset(df, dataset_name)

        return df, results

    except FileNotFoundError:
        print(f"[ERROR] File '{file_path}' not found.")
        print("Please make sure the file path is correct and the file exists.")
        return None, None
    except pd.errors.EmptyDataError:
        print("[ERROR] The CSV file is empty.")
        return None, None
    except pd.errors.ParserError as e:
        print(f"[ERROR] Error parsing CSV file: {e}")
        print("Please check if the file is properly formatted.")
        return None, None
    except Exception as e:
        # Deliberate catch-all: callers rely on (None, None) to trigger their
        # fallback path instead of handling exceptions themselves.
        print(f"[ERROR] Unexpected error: {e}")
        return None, None

if __name__ == "__main__":
    # Demo entry point: profile the configured CSV and, when it cannot be
    # loaded, profile a reproducible synthetic customer table instead.
    rule = "=" * 80
    csv_file_path = "bank_cleaned.csv"

    print("LOADING AND PROFILING YOUR CSV FILE")
    print(rule)

    df, results = load_and_profile_csv(csv_file_path)

    if df is None:
        print("\n" + rule, "FALLBACK: USING SAMPLE DATA FOR DEMONSTRATION", rule, sep="\n")

        # Fixed seed so the synthetic data (and hence the report) is
        # repeatable. The random draws below must keep this exact order.
        sample_size = 1000
        np.random.seed(42)
        sample_data = {
            'customer_id': range(1, sample_size + 1),
            'age': np.random.normal(35, 12, sample_size).astype(int),
            'income': np.random.normal(50000, 15000, sample_size),
            'category': np.random.choice(['A', 'B', 'C', 'D'], sample_size),
            'score': np.random.normal(75, 15, sample_size),
            'status': np.random.choice(['Active', 'Inactive', 'Pending'], sample_size, p=[0.6, 0.3, 0.1])
        }

        sample_df = pd.DataFrame(sample_data)

        # Blank out randomly chosen cells so the missing-value section has
        # something to report (rows are drawn with replacement).
        for target_column, draws in (('income', 50), ('score', 30)):
            sample_df.loc[np.random.choice(sample_df.index, draws), target_column] = np.nan

        # Append copies of 20 random rows so duplicate detection fires.
        duplicate_rows = sample_df.sample(20)
        sample_df = pd.concat([sample_df, duplicate_rows], ignore_index=True)

        results = profile_dataset(sample_df, "Sample Customer Dataset")

    else:
        print("\n" + rule, "ADDITIONAL ANALYSIS OPTIONS", rule, sep="\n")
        print("Your data has been loaded successfully!")
        print(f"Dataset shape: {df.shape}")
        print("\nFirst few rows of your data:")
        print(df.head())

def quick_preview(file_path, rows=5):
    """Load a CSV and print its shape, first rows, column names and dtypes.

    Args:
        file_path: Path (or other ``pd.read_csv`` input) of the CSV to load.
        rows: Number of leading rows to print.

    Returns:
        The loaded DataFrame, or None after printing an error message when
        loading fails.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset Preview: {file_path}")
        print(f"Shape: {df.shape}")
        # Needs the f-prefix; without it the literal "{rows}" was printed.
        print(f"\nFirst {rows} rows:")
        print(df.head(rows))
        print("\nColumn names:")
        print(df.columns.tolist())
        print("\nData types:")
        print(df.dtypes)
        return df
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error loading file: {e}")
        return None

def save_profiling_report(results, output_file="profiling_report.txt"):
    """Write the basic-info and recommendation sections to a text file.

    Only the ``'basic_info'`` and ``'recommendations'`` entries of *results*
    are written; other sections are ignored. Success or failure is reported
    on stdout and the function returns None either way.

    Args:
        results: Mapping of profiling results. When None (for example after
            a failed load), an error is printed and no file is created.
        output_file: Destination path; an existing file is overwritten.
    """
    if results is None:
        # Bail out before opening the file so a failed run does not leave a
        # header-only report behind.
        print("Error saving report: no profiling results to save")
        return

    try:
        parts = [
            "DATA PROFILING REPORT\n",
            "=" * 50 + "\n",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        ]

        if 'basic_info' in results:
            parts.append("BASIC INFORMATION\n")
            parts.append("-" * 30 + "\n")
            parts.extend(f"{key}: {value}\n" for key, value in results['basic_info'].items())
            parts.append("\n")

        if 'recommendations' in results:
            parts.append("RECOMMENDATIONS\n")
            parts.append("-" * 30 + "\n")
            parts.extend(f"{i}. {rec}\n" for i, rec in enumerate(results['recommendations'], 1))

        # Explicit encoding keeps the output identical across platforms; the
        # whole report is assembled first so it is written in a single call.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

        print(f"[SUCCESS] Profiling report saved to: {output_file}")

    except Exception as e:
        # Best-effort export: report the failure rather than raising.
        print(f"Error saving report: {e}")

LOADING AND PROFILING YOUR CSV FILE
Loading data from: bank_cleaned.csv
[ERROR] File 'bank_cleaned.csv' not found.
Please make sure the file path is correct and the file exists.

FALLBACK: USING SAMPLE DATA FOR DEMONSTRATION
DATA PROFILING REPORT: Sample Customer Dataset
Report Generated: 2025-08-02 19:59:44

BASIC INFORMATION
----------------------------------------
Total Rows: 1,020
Total Columns: 6
Memory Usage: 0.15 MB
Duplicate Rows: 20 (1.96%)

DATA TYPES ANALYSIS
----------------------------------------
     column   dtype  non_null_count  null_count  null_percentage  unique_values  unique_percentage
customer_id   int64            1020           0         0.000000           1000          98.039216
        age   int64            1020           0         0.000000             66           6.470588
     income float64             968          52         5.098039            950          93.137255
   category  object            1020           0         0.000000              4         