# Cognifyz Internship Program - EDA

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

In [32]:
df = pd.read_csv("Internship Dataset.csv")

In [34]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [36]:
col_name = "restaurant_name" if "restaurant_name" in df.columns else df.columns[0]
col_city = "city" if "city" in df.columns else None
col_cuisines = "cuisines" if "cuisines" in df.columns else None
col_rating = "aggregate_rating" if "aggregate_rating" in df.columns else None
col_votes = "votes" if "votes" in df.columns else None
col_price = "price_range" if "price_range" in df.columns else None
col_online = "has_online_delivery" if "has_online_delivery" in df.columns else None
col_table = "has_table_booking" if "has_table_booking" in df.columns else None

In [19]:
df

Unnamed: 0,restaurant_id,restaurant_name,country_code,city,address,locality,locality_verbose,longitude,latitude,cuisines,...,currency,has_table_booking,has_online_delivery,is_delivering_now,switch_to_order_menu,price_range,aggregate_rating,rating_color,rating_text,votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.584450,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9546,5915730,Naml۱ Gurme,208,��stanbul,"Kemanke�� Karamustafa Pa��a Mahallesi, R۱ht۱m ...",Karak�_y,"Karak�_y, ��stanbul",28.977392,41.022793,Turkish,...,Turkish Lira(TL),No,No,No,No,3,4.1,Green,Very Good,788
9547,5908749,Ceviz A��ac۱,208,��stanbul,"Ko��uyolu Mahallesi, Muhittin ��st�_nda�� Cadd...",Ko��uyolu,"Ko��uyolu, ��stanbul",29.041297,41.009847,"World Cuisine, Patisserie, Cafe",...,Turkish Lira(TL),No,No,No,No,3,4.2,Green,Very Good,1034
9548,5915807,Huqqa,208,��stanbul,"Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N...",Kuru�_e��me,"Kuru�_e��me, ��stanbul",29.034640,41.055817,"Italian, World Cuisine",...,Turkish Lira(TL),No,No,No,No,4,3.7,Yellow,Good,661
9549,5916112,A���k Kahve,208,��stanbul,"Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N...",Kuru�_e��me,"Kuru�_e��me, ��stanbul",29.036019,41.057979,Restaurant Cafe,...,Turkish Lira(TL),No,No,No,No,4,4.0,Green,Very Good,901


In [21]:
def split_cuisines(val):
    if pd.isna(val): return []
    return [c.strip() for c in re.split(r"[,/]| & ", str(val)) if c.strip()]

# Level 1

### Top Cuisines - Determine the top three most common cuisines in the dataset.

In [38]:
if col_cuisines:
    df["_cuisine_list"] = df[col_cuisines].apply(split_cuisines)
    all_cuisines = [c for lst in df["_cuisine_list"] for c in lst]
    cuisine_counts = pd.Series(Counter(all_cuisines)).sort_values(ascending=False)
    top3 = cuisine_counts.head(3)

    print("\nTask 1: Top 3 Cuisines")
    print(top3)



Task 1: Top 3 Cuisines
North Indian    3960
Chinese         2735
Fast Food       1986
dtype: int64


In [40]:
for c in top3.index:
        percent = df["_cuisine_list"].apply(lambda x: c in x).mean() * 100
        print(f"{c}: {percent:.2f}% of restaurants")


North Indian: 41.46% of restaurants
Chinese: 28.61% of restaurants
Fast Food: 20.79% of restaurants


### --- Task 2: City Analysis ---

In [44]:
if col_city:
    city_counts = df.groupby(col_city)[col_name].nunique().sort_values(ascending=False)
    city_avg_rating = df.groupby(col_city)[col_rating].mean().sort_values(ascending=False)

    print("\nTask 2: City with Most Restaurants ->", city_counts.idxmax())
    print("Task 2: City with Highest Avg Rating ->", city_avg_rating.idxmax())


Task 2: City with Most Restaurants -> New Delhi
Task 2: City with Highest Avg Rating -> Inner City


In [46]:
    city_summary = pd.DataFrame({
        "Restaurants": city_counts,
        "Avg_Rating": df.groupby(col_city)[col_rating].mean()
    }).sort_values("Restaurants", ascending=False)
print("\nCity Summary (Top 5):")
    print(city_summary.head())


City Summary (Top 5):
           Restaurants  Avg_Rating
city                              
New Delhi         4295    2.438845
Gurgaon            943    2.651431
Noida              902    2.036204
Faridabad          235    1.866932
Ghaziabad           25    2.852000


### --- Task 3: Price Range Distribution ---

In [49]:
if col_price:
    price_counts = df[col_price].value_counts().sort_index()
    price_percent = (price_counts / len(df) * 100).round(2)
    price_dist = pd.DataFrame({"Count": price_counts, "Percent": price_percent})
    print("\nTask 3: Price Range Distribution")
    print(price_dist)


Task 3: Price Range Distribution
             Count  Percent
price_range                
1             4444    46.53
2             3113    32.59
3             1408    14.74
4              586     6.14


### --- Task 4: Online Delivery ---

In [52]:
if col_online:
    df[col_online] = df[col_online].astype(str).str.lower().map({"yes":1,"no":0})
    pct_online = df[col_online].mean() * 100
    ratings_by_delivery = df.groupby(col_online)[col_rating].mean()

    print("\nTask 4: % of Restaurants with Online Delivery ->", round(pct_online, 2))
    print("Avg Rating (Online Delivery = Yes):", ratings_by_delivery.get(1, np.nan))
    print("Avg Rating (Online Delivery = No):", ratings_by_delivery.get(0, np.nan))


Task 4: % of Restaurants with Online Delivery -> 25.66
Avg Rating (Online Delivery = Yes): 3.2488372093023252
Avg Rating (Online Delivery = No): 2.465295774647887


# LEVEL 2

### --- Task 1: Ratings ---

In [57]:
if col_rating:
    most_common_range = pd.cut(df[col_rating], bins=10).value_counts().idxmax()
    avg_votes = df[col_votes].mean() if col_votes else None
    print("\nTask 1: Most Common Rating Range:", most_common_range)
    print("Average Votes:", avg_votes)


Task 1: Most Common Rating Range: (2.94, 3.43]
Average Votes: 156.909747670401


### --- Task 2: Cuisine Combinations ---

In [66]:
if col_cuisines:
    df["_combo"] = df["_cuisine_list"].apply(lambda lst: ", ".join(sorted(lst)))
    combo_counts = df["_combo"].value_counts().head(10)
    print("\nTask 2: Top Cuisine Combinations")
    print(combo_counts)


Task 2: Top Cuisine Combinations
_combo
North Indian                      936
Chinese, North Indian             616
Mughlai, North Indian             394
Chinese                           354
Fast Food                         354
Chinese, Mughlai, North Indian    306
Cafe                              299
Bakery                            218
Bakery, Desserts                  181
Chinese, Fast Food                159
Name: count, dtype: int64


In [68]:
    combo_rating = df.groupby("_combo")[col_rating].agg(["count","mean"])
    high_rating_combos = combo_rating[combo_rating["count"] >= 5].sort_values("mean", ascending=False).head(10)
    print("\nCuisine Combinations with Highest Avg Rating (min 5 restaurants):")
    print(high_rating_combos)


Cuisine Combinations with Highest Avg Rating (min 5 restaurants):
                                       count      mean
_combo                                                
European, Mediterranean, North Indian      8  4.587500
Burger                                     6  4.450000
Modern Indian                             11  4.345455
Indian                                    18  4.250000
International                              6  4.233333
Italian, Pizza, Sandwich                   5  4.220000
French                                     7  4.185714
Steak                                      7  4.185714
Pizza, Sandwich                            5  4.140000
Seafood                                   14  4.114286


### --- Task 3: Geographic Analysis ---

In [72]:
if "longitude" in df.columns and "latitude" in df.columns:
    print("\nTask 3: Geographic Analysis (sample coordinates)")
    print(df[["longitude","latitude"]].dropna().head())


Task 3: Geographic Analysis (sample coordinates)
    longitude   latitude
0  121.027535  14.565443
1  121.014101  14.553708
2  121.056831  14.581404
3  121.056475  14.585318
4  121.057508  14.584450


### --- Task 4: Restaurant Chains ---

In [76]:
chain_counts = df[col_name].value_counts()
chains = chain_counts[chain_counts > 1]
if not chains.empty:
    chain_summary = df[df[col_name].isin(chains.index)].groupby(col_name).agg(
        Restaurants=(col_name, "count"),
        Avg_Rating=(col_rating, "mean"),
        Total_Votes=(col_votes, "sum"),
        Cities=(col_city, lambda x: ", ".join(sorted(set(x.dropna()))))
    ).sort_values("Restaurants", ascending=False)

    print("\nTask 4: Restaurant Chains (Top 5):")
    print(chain_summary.head())


Task 4: Restaurant Chains (Top 5):
                  Restaurants  Avg_Rating  Total_Votes  \
restaurant_name                                          
Cafe Coffee Day            83    2.419277         2428   
Domino's Pizza             79    2.740506         6643   
Subway                     63    2.907937         6124   
Green Chick Chop           51    2.672549          964   
McDonald's                 48    3.339583         5291   

                                                             Cities  
restaurant_name                                                      
Cafe Coffee Day     Faridabad, Ghaziabad, Gurgaon, New Delhi, Noida  
Domino's Pizza    Aurangabad, Faridabad, Ghaziabad, Gurgaon, Lud...  
Subway              Allahabad, Faridabad, Gurgaon, New Delhi, Noida  
Green Chick Chop               Faridabad, Gurgaon, New Delhi, Noida  
McDonald's        Allahabad, Faridabad, Ghaziabad, Gurgaon, New ...  


# Level 3

### --- Task 1: Restaurant Reviews ---

In [15]:
import pandas as pd
from collections import Counter

# Load your dataset (replace with your actual file name)
df = pd.read_csv('restaurant_data.csv')

# Check if 'Reviews' column exists
if 'Reviews' in df.columns and 'Aggregate_rating' in df.columns:
    # Combine all non-null reviews into one big string and split into words
    positive_keywords = Counter(' '.join(df['Reviews'].dropna()).split())

    # Create a new column for review length
    df['Review_length'] = df['Reviews'].fillna('').str.len()
    
    # Calculate correlation between review length and aggregate rating
    length_rating_corr = df[['Review_length', 'Aggregate_rating']].corr()
    
    print("Most common keywords in reviews:")
    print(positive_keywords.most_common(10))
    print("\nCorrelation between review length and rating:")
    print(length_rating_corr)
else:
    print("The required columns 'Reviews' and/or 'Aggregate_rating' do not exist in the dataset.")

FileNotFoundError: [Errno 2] No such file or directory: 'restaurant_data.csv'

### --- Task 2: Votes Analysis ---

In [117]:
if col_votes:
    top_voted = df.loc[df[col_votes].idxmax(), [col_name, col_city, col_votes, col_rating]]
    low_voted = df.loc[df[col_votes].idxmin(), [col_name, col_city, col_votes, col_rating]]
    corr = df[[col_votes, col_rating]].corr().iloc[0,1]

    print("\nTask 2: Top Voted Restaurant\n", top_voted)
    print("Lowest Voted Restaurant\n", low_voted)
    print("Correlation between Votes & Rating:", corr)


Task 2: Top Voted Restaurant
 restaurant_name          Toit
city                Bangalore
votes                   10934
aggregate_rating          4.8
Name: 728, dtype: object
Lowest Voted Restaurant
 restaurant_name     Cantinho da Gula
city                      S��o Paulo
votes                              0
aggregate_rating                 0.0
Name: 69, dtype: object
Correlation between Votes & Rating: 0.31369058419540985


### --- Task 3: Price Range vs Online Delivery & Table Booking ---

In [120]:
if col_price and col_online and col_table:
    df[col_table] = df[col_table].astype(str).str.lower().map({"yes":1,"no":0})
    piv = df.groupby(col_price).agg(
        Restaurants=(col_price, "count"),
        OnlineDelivery=(col_online, "mean"),
        TableBooking=(col_table, "mean")
    )
    print("\nTask 3: Price Range vs Services")
    print(piv)


Task 3: Price Range vs Services
             Restaurants  OnlineDelivery  TableBooking
price_range                                           
1                   4444        0.157741      0.000225
2                   3113        0.413106      0.076775
3                   1408        0.291903      0.457386
4                    586        0.090444      0.467577
