<a href="https://colab.research.google.com/github/ayushpatial2004/RAG-PROJECT/blob/main/Interactive_Data_Analysis_and_Visualization_Tool_using_Python_in_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time

# Choose file using upload dialog.
# `files.upload()` returns a mapping keyed by the uploaded file name(s).
uploaded = files.upload()

# Nothing is returned when the dialog is cancelled; stop with a clear
# message instead of letting `next()` raise a bare StopIteration.
if not uploaded:
    raise SystemExit("No file was uploaded. Run the cell again and choose a file.")

# Get the filename (only the first uploaded file is analysed).
path = next(iter(uploaded))

# File reading based on extension. The comparison is case-insensitive so
# names such as "DATA.CSV" are accepted as well.
readers = {
    '.csv': pd.read_csv,
    '.json': pd.read_json,
    '.xlsx': pd.read_excel,
}
lowered_path = path.lower()
reader = next(
    (read for extension, read in readers.items() if lowered_path.endswith(extension)),
    None,
)

if reader is None:
    print("You have selected the wrong format of the file.")
    print("Try again!")
    # `exit()` is an interactive-shell helper and is not a reliable way to
    # stop a script or a notebook cell; raising SystemExit halts execution
    # here so the code below never runs without `doc`.
    raise SystemExit(f"Unsupported file type: {path!r} (expected .csv, .json or .xlsx)")

doc = reader(path)

# Show the summary statistics of the raw (not yet cleaned) data, framed by
# blank lines.
summary = doc.describe()
for chunk in ("Here is the Description of your file uploaded", "\n", summary, "\n"):
    print(chunk)

# Clean in place: first drop every row that has a missing value, then drop
# exact duplicate rows.
print("Wait, data is cleaning....")
for cleanup in (doc.dropna, doc.drop_duplicates):
    cleanup(inplace=True)
time.sleep(3)
print("\nData is being cleaned\n")

# Numerical columns: those stored as int64 or float64.
numeric_preview = doc.select_dtypes(include=['int64', 'float64'])
numerical_columns = numeric_preview.columns
print("Here are the numerical columns of data sets:")
print(numeric_preview.head())

# Categorical columns: those stored with the generic `object` dtype.
print("\nWait, let me show you the categorical columns of the dataset.....")
time.sleep(3)
categorical_preview = doc.select_dtypes(include=['object'])
categorical_columns = categorical_preview.columns
print("\nHere are the categorical columns of data sets:")
print(categorical_preview.head())

print("Carefully enter the column name (write the name as written in the dataset)")
a = input("Choose the column you want to know: ")

# Data Separation: report on the chosen column according to the group
# (numerical / categorical) it was placed in above.
if a in numerical_columns:
    print("You have chosen the numerical column\n")
    print("Here are some basic math stats of this column\n")
    print(doc[a].describe())
elif a in categorical_columns:
    print("You have chosen the categorical column\n")
    print("Sorry, mathematical operations cannot be performed on categorical columns.")
else:
    # Previously a misspelled (or otherwise unclassified) name produced no
    # output at all, leaving the user without feedback.
    print(f"Column '{a}' was not found among the numerical or categorical columns.")

print("\nNow carefully enter the column name (write the name as written in the dataset)")

# Map the text form of every column label to the real label so that the
# typed name can be validated here, rather than failing later with a
# KeyError inside the plotting code. This also lets non-string labels
# (e.g. integer column names) be selected by typing them.
column_lookup = {str(label): label for label in doc.columns}
if not column_lookup:
    raise SystemExit("The dataset has no columns to analyze.")

b = input("Choose the column you want to analyze: ")
while b not in column_lookup:
    print(f"Column '{b}' does not exist. Available columns: {', '.join(column_lookup)}")
    b = input("Choose the column you want to analyze: ")
b = column_lookup[b]

print("Analyzing... Please wait...")
time.sleep(3)
print("\n")

# Profit calculation: only performed when the dataset has both a numerical
# 'Unit_Price' and a numerical 'Quantity_Sold' column.
if 'Unit_Price' in numerical_columns and 'Quantity_Sold' in numerical_columns:
    # Per-row product of price and quantity.
    profit = doc['Unit_Price'] * doc['Quantity_Sold']
    total_profit = profit.sum()
    print("Here is your total profit:")
    # Print the summed total that the message announces, not the per-row
    # Series that was printed before.
    print(total_profit)

# Visualization of the column chosen above (`b`): a line plot, a histogram
# and a distribution plot, each shown as its own figure.

# Line plot: a pandas Series is drawn with its index on the x-axis and its
# values on the y-axis, so the axis labels follow that orientation.
plt.plot(doc[b], color='red')
plt.title("Line Plot")
plt.xlabel("Index")
plt.ylabel(f"{b} Values")
plt.show()

# Histogram: the column's values are binned along x; y is the number of
# rows falling in each bin.
plt.hist(doc[b])
plt.title("Histogram")
plt.xlabel(f"{b} Values")
plt.ylabel("Frequency")
plt.show()

# Distribution plot (histogram with a KDE curve overlaid). `displot`
# creates its own figure; the pyplot calls below label that figure's axes.
sns.displot(doc[b], kde=True)
plt.title("Distribution Plot")
plt.xlabel(f"{b} Values")
plt.ylabel("Count")
plt.show()

print("Complete visualization of the dataset for both categorical and numerical columns starts here...\n")

# Categorical Visualizations: one bar chart per categorical column showing
# how many rows carry each distinct value.
print("Visualization for Categorical Data")
for column in categorical_columns:
    value_counts = doc[column].value_counts()
    _, axis = plt.subplots(figsize=(10, 6))
    sns.barplot(x=value_counts.index, y=value_counts.values, ax=axis)
    axis.set_title(f"Bar Plot of {column}")
    axis.set_xlabel(column)
    axis.set_ylabel("Count")
    # Slant the category labels so long names do not overlap.
    plt.setp(axis.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Numerical Visualizations: pie chart of column totals, scatter plot,
# correlation heatmap and pair plot over all numerical columns.
print("Visualization for Numerical Data\n")

if len(numerical_columns) == 0:
    # With no numerical columns there is nothing for any of the plots below
    # to draw, so skip them instead of failing on an empty selection.
    print("No numerical columns were found; skipping numerical visualizations.")
else:
    numeric_data = doc[numerical_columns]

    # Pie Chart
    # A pie chart can only represent positive shares: matplotlib rejects
    # negative wedge sizes, and zero totals contribute no slice. Only
    # columns with a positive sum are drawn.
    column_sums = numeric_data.sum()
    positive_sums = column_sums[column_sums > 0]
    if positive_sums.empty:
        print("No numerical column has a positive total; skipping the pie chart.")
    else:
        skipped = [str(label) for label in column_sums.index.difference(positive_sums.index)]
        if skipped:
            print(f"Columns left out of the pie chart (non-positive total): {', '.join(skipped)}")
        plt.pie(positive_sums, labels=positive_sums.index, autopct='%1.1f%%', startangle=90)
        plt.title("Pie Chart of All Numerical Columns (Summed)")
        plt.ylabel("")
        plt.show()

    # Scatter Plot (wide-form: every numerical column plotted against the
    # row index).
    sns.scatterplot(data=numeric_data)
    plt.show()

    # Correlation Heatmap
    correlation_matrix = numeric_data.corr()
    plt.figure(figsize=(20, 16))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title("Correlation Heatmap of All Numerical Columns")
    plt.show()

    # Pair Plot
    sns.pairplot(numeric_data)
    # y > 1 places the figure title just above the grid of subplots.
    plt.suptitle("Pairplot of All Numerical Columns", y=1.02)
    plt.show()
