# National Mental Health Datathon

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
import re
import os
import io
import difflib
from collections import Counter
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from contextlib import redirect_stdout
import tqdm
from datetime import datetime
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from textblob import TextBlob
from typing import Tuple, Dict, Any

## Utilities

#### Capture Print

In [2]:
# Function to capture print outputs
def capture_print(func, filename):
    """Run *func* and save whatever it prints to ``text_outputs/<filename>.txt``.

    Standard output is redirected into an in-memory buffer while ``func()``
    runs. If the captured text is empty or whitespace-only, no file is
    written.

    Args:
        func: Zero-argument callable whose printed output should be captured.
        filename: Base name (without extension) of the output file inside
            the ``text_outputs`` folder.
    """
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        func()
    output = buffer.getvalue()

    # Nothing meaningful was printed: do not create an empty file.
    if not output.strip():
        return

    out_path = f'text_outputs/{filename}.txt'
    # Create the target folder on demand; without this the first call on a
    # fresh checkout fails with FileNotFoundError.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Explicit UTF-8 so non-ASCII output does not depend on the platform's
    # default encoding.
    with open(out_path, 'w', encoding='utf-8') as file:
        file.write(output)

#### Load Data

In [None]:
# Load all .csv files within the input folder
# Code to be revised based on data format provided
def _read_csv_with_fallback(file_path, encodings):
    """Read *file_path* with the first encoding that decodes it, else return None."""
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError:
            continue
    return None


def load_csv_files(folder_path, encodings=('utf-8',)):
    """Load every ``.csv`` file in *folder_path* into one combined DataFrame.

    Each file is read with the first encoding in *encodings* that decodes it.
    Its ``'Assessment Completion Date'`` column is then parsed to datetimes
    (mixed formats, day first). A file that cannot be decoded, or that raises
    any other error while loading, is reported with a printed message and
    skipped, so one bad file does not abort the whole load.

    Args:
        folder_path: Directory to scan for files whose name ends in ``.csv``.
        encodings: Encoding name, or sequence of encoding names, to try in
            order for each file. Defaults to UTF-8 only.

    Returns:
        A DataFrame with the rows of all successfully loaded files,
        concatenated in filename order with a fresh integer index.

    Raises:
        ValueError: If no CSV file was successfully loaded.
    """
    # Accept a single encoding name as a convenience.
    if isinstance(encodings, str):
        encodings = (encodings,)

    all_data = []
    file_info = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined dataframe reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            frame = _read_csv_with_fallback(file_path, encodings)
            if frame is None:
                print(f"Failed to read {filename} with any encoding")
                continue

            frame['Assessment Completion Date'] = pd.to_datetime(
                frame['Assessment Completion Date'],
                format='mixed',
                dayfirst=True
            )
        except Exception as e:
            # Deliberate best-effort: report the problem and move on to the
            # next file rather than aborting the whole load.
            print(f"Error loading {filename}: {str(e)}")
            continue

        print(f"Successfully loaded: {filename}")
        all_data.append(frame)

        # Collect information about the file
        file_info.append({
            'filename': filename,
            'rows': len(frame),
            'columns': len(frame.columns),
            'memory_usage': frame.memory_usage(deep=True).sum() / 1024**2  # in MB
        })

    if not all_data:
        raise ValueError("No CSV files were successfully loaded")

    # Concatenate all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Print information about loaded files
    print("\nFile Loading Summary:")
    for info in file_info:
        print(f"{info['filename']}: {info['rows']} rows, {info['columns']} columns, {info['memory_usage']:.2f} MB")

    return combined_df

# Specify the path to the input folder
input_folder = 'input'

# Load all CSV files from the input folder
df = load_csv_files(input_folder)

# Display the first few rows of the combined dataframe
print(df.head())

# Display information about the combined dataframe.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("\nDataframe Info:")
df.info()

# Display the number of rows in the combined dataframe
print(f"\nTotal number of rows: {len(df)}")

# Display the number of CSV files found in the input folder. Only names
# ending in '.csv' are counted, so unrelated files and subfolders do not
# inflate the figure. Files that were found but failed to load are still
# included; see the messages printed during loading for those.
csv_file_count = sum(1 for name in os.listdir(input_folder) if name.endswith('.csv'))
print(f"Number of files processed: {csv_file_count}")