In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from its CSV file into a DataFrame.
df = pd.read_csv('yasotha/CognifyzTasks/Dataset .csv')

# Force the two columns used throughout the analysis to a numeric dtype.
# With errors='coerce', any value that cannot be parsed becomes NaN instead
# of raising.
for numeric_col in ('Aggregate rating', 'Votes'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Per-cuisine summary: for every distinct value of the 'Cuisines' column,
# take the mean of 'Aggregate rating' and the sum of 'Votes'.
cuisine_aggregations = {'Aggregate rating': 'mean', 'Votes': 'sum'}
cuisine_stats = df.groupby('Cuisines').agg(cuisine_aggregations).reset_index()

# Rank cuisines from most to fewest total votes; total votes is the
# popularity measure used in the rest of the analysis.
top_cuisines = cuisine_stats.sort_values('Votes', ascending=False)

# Keep the cuisines whose mean rating is strictly above the mean rating
# computed over all rows of the dataset.
overall_avg_rating = df['Aggregate rating'].mean()
is_above_average = cuisine_stats['Aggregate rating'].gt(overall_avg_rating)
high_rating_cuisines = cuisine_stats[is_above_average]

# Report the first ten rows of each ranking.
print("Top cuisines by number of votes:")
print(top_cuisines.head(10))

print("\nCuisines with average rating above overall average:")
best_rated_first = high_rating_cuisines.sort_values('Aggregate rating', ascending=False)
print(best_rated_first.head(10))

# Visualization 1: Distribution of ratings
# Histogram (20 bins) of the non-missing aggregate ratings, with a kernel
# density estimate drawn on top.
rating_values = df['Aggregate rating'].dropna()
plt.figure(figsize=(10, 6))
rating_ax = sns.histplot(rating_values, bins=20, kde=True)
# histplot returns the Axes it drew on, so label that object directly.
rating_ax.set_title('Distribution of Aggregate Ratings')
rating_ax.set_xlabel('Aggregate Rating')
rating_ax.set_ylabel('Frequency')
plt.show()

# Visualization 2: Average ratings by top cuisines
# Horizontal bars showing the mean rating of the ten cuisines that have the
# most total votes.
plt.figure(figsize=(12, 8))
top_cuisines_subset = top_cuisines.head(10)
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning,
# scheduled for removal in 0.14). Assigning the categorical axis variable to
# `hue` gives the same one-colour-per-bar result. `dodge=False` keeps the bars
# full-width and centred on their category on seaborn versions that would
# otherwise offset hue levels.
cuisine_ax = sns.barplot(
    data=top_cuisines_subset,
    x='Aggregate rating',
    y='Cuisines',
    hue='Cuisines',
    dodge=False,
    palette='viridis',
)
# The hue only repeats the y-axis labels, so drop the legend if this seaborn
# version drew one. (`legend=False` is avoided because older seaborn releases
# do not accept that keyword for barplot.)
cuisine_legend = cuisine_ax.get_legend()
if cuisine_legend is not None:
    cuisine_legend.remove()
plt.title('Average Ratings of Top 10 Cuisines by Votes')
plt.xlabel('Average Aggregate Rating')
plt.ylabel('Cuisine')
plt.show()

# Visualization 3: Average ratings by city
# Per-city summary: mean of 'Aggregate rating' and sum of 'Votes', then keep
# the ten cities with the most total votes.
city_stats = df.groupby('City').agg({
    'Aggregate rating': 'mean',
    'Votes': 'sum'
}).reset_index()
top_cities = city_stats.sort_values(by='Votes', ascending=False).head(10)

plt.figure(figsize=(12, 8))
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning,
# scheduled for removal in 0.14). Assigning the categorical axis variable to
# `hue` gives the same one-colour-per-bar result. `dodge=False` keeps the bars
# full-width and centred on their category on seaborn versions that would
# otherwise offset hue levels.
city_ax = sns.barplot(
    data=top_cities,
    x='Aggregate rating',
    y='City',
    hue='City',
    dodge=False,
    palette='magma',
)
# The hue only repeats the y-axis labels, so drop the legend if this seaborn
# version drew one. (`legend=False` is avoided because older seaborn releases
# do not accept that keyword for barplot.)
city_legend = city_ax.get_legend()
if city_legend is not None:
    city_legend.remove()
plt.title('Average Ratings of Top 10 Cities by Votes')
plt.xlabel('Average Aggregate Rating')
plt.ylabel('City')
plt.show()

# Visualization 4: Relationship between features and target variable
# One scatter plot per feature, each against 'Aggregate rating' on the y-axis.
# Each entry pairs the DataFrame column name with the label shown on the plot.
feature_axes = (
    ('Votes', 'Votes'),
    ('Average Cost for two', 'Average Cost for Two'),
)
for feature_column, feature_label in feature_axes:
    plt.figure(figsize=(10, 6))
    # alpha=0.5 makes points semi-transparent so overlapping points stay visible.
    sns.scatterplot(data=df, x=feature_column, y='Aggregate rating', alpha=0.5)
    plt.title(f'Relationship between {feature_label} and Aggregate Rating')
    plt.xlabel(feature_label)
    plt.ylabel('Aggregate Rating')
    plt.show()
