# In [3]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go


# In [5]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go

# Step 1: Data Cleaning and Preprocessing
# Load the raw product listing; every later step works on this DataFrame.
df = pd.read_csv("flipkart_com-ecommerce_sample.csv")

# Handling missing values: rows without both prices or a description are
# dropped, so the price comparison at the end of this step never sees NaN.
df.dropna(subset=['retail_price', 'discounted_price', 'description'], inplace=True)

# Convert date columns to datetime data type
df['crawl_timestamp'] = pd.to_datetime(df['crawl_timestamp'])

# Convert 'product_rating' and 'overall_rating' to numeric; entries that
# cannot be parsed as numbers become NaN (errors='coerce').
df['product_rating'] = pd.to_numeric(df['product_rating'], errors='coerce')
df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

# Extract the primary category from 'product_category_tree'.
# NOTE(review): assumes tree strings look like '["A >> B >> C"]' (inferred
# from the two-character prefix the previous implementation sliced off) --
# confirm against the data. The vectorised .str accessor leaves missing trees
# as NaN instead of raising, and the strip removes the list/quote wrapper and
# surrounding spaces on both sides, so single-level trees no longer keep a
# trailing '"]' and multi-level ones no longer keep a trailing space.
df['category'] = (
    df['product_category_tree']
    .str.split('>>')
    .str[0]
    .str.strip('[]" ')
)

# Removing rows where 'discounted_price' is greater than 'retail_price'.
# .copy() makes the result an independent frame, so columns added to df later
# are not assignments on a filtered slice (avoids SettingWithCopyWarning).
df = df[df['discounted_price'] <= df['retail_price']].copy()

# Step 2: Exploratory Data Analysis (EDA)
# 1. Distribution of ratings
# Histogram of df['product_rating'] in at most 20 bins, with bar heights
# normalised to probabilities rather than raw counts.
_rating_trace = go.Histogram(
    histnorm='probability',
    nbinsx=20,
    x=df['product_rating'],
)
_rating_layout = go.Layout(
    title='Distribution of Product Ratings',
    xaxis=dict(title='Product Rating'),
    yaxis=dict(title='Probability'),
)
# Figure in plain-dict form ('data' + 'layout'), as passed to dcc.Graph below.
rating_distribution_fig = dict(data=[_rating_trace], layout=_rating_layout)

# 2. Category-wise distribution
# Count rows per primary category and keep the ten most frequent.
category_counts = df['category'].value_counts().head(10)

_category_trace = go.Bar(x=category_counts.index, y=category_counts.values)
_category_layout = go.Layout(
    title='Top 10 Categories by Count',
    # Tick labels are rotated by -45 degrees so long category names fit.
    xaxis=dict(title='Category', tickangle=-45),
    yaxis=dict(title='Count'),
)
category_distribution_fig = dict(data=[_category_trace], layout=_category_layout)

# 3. Retail price vs. Discounted price scatter plot
# One semi-transparent marker per row of df: retail price on x,
# discounted price on y.
_price_marker_style = dict(size=8, opacity=0.7)
_price_trace = go.Scatter(
    mode='markers',
    marker=_price_marker_style,
    x=df['retail_price'],
    y=df['discounted_price'],
)
_price_layout = go.Layout(
    title='Retail Price vs. Discounted Price',
    xaxis=dict(title='Retail Price'),
    yaxis=dict(title='Discounted Price'),
)
retail_vs_discounted_fig = dict(data=[_price_trace], layout=_price_layout)

# 4. Distribution of FK Advantage Products
# Number of rows for each distinct value of the 'is_FK_Advantage_product' flag.
advantage_counts = df['is_FK_Advantage_product'].value_counts()

_advantage_trace = go.Bar(x=advantage_counts.index, y=advantage_counts.values)
_advantage_layout = go.Layout(
    title='Distribution of FK Advantage Products',
    xaxis=dict(title='FK Advantage Product'),
    yaxis=dict(title='Count'),
)
advantage_distribution_fig = dict(data=[_advantage_trace], layout=_advantage_layout)

# 5. Trend of Crawled Data Over Time
# Reduce each crawl timestamp to its calendar date (stored on df as
# 'crawl_date'), then count rows per date in chronological order.
df['crawl_date'] = df['crawl_timestamp'].dt.date
_rows_per_date = df['crawl_date'].value_counts()
date_counts = _rows_per_date.sort_index()

_crawl_trace = go.Scatter(
    mode='lines+markers',
    x=date_counts.index,
    y=date_counts.values,
)
_crawl_layout = go.Layout(
    title='Trend of Crawled Data Over Time',
    # Date labels are rotated by -45 degrees to keep them from overlapping.
    xaxis=dict(title='Crawl Date', tickangle=-45),
    yaxis=dict(title='Number of Entries'),
)
crawl_trend_fig = dict(data=[_crawl_trace], layout=_crawl_layout)

# 6. Top 15 Brands
# The fifteen most frequent values of the 'brand' column.
top_brands = df['brand'].value_counts().head(15)

# Horizontal bars: product counts run along x, brand names along y.
_brand_trace = go.Bar(
    orientation='h',
    x=top_brands.values,
    y=top_brands.index,
    marker={'color': 'skyblue'},
)
_brand_layout = go.Layout(
    title='Top 15 Brands',
    xaxis=dict(title='Number of Products'),
    yaxis=dict(title='Brand'),
)
top_brands_fig = dict(data=[_brand_trace], layout=_brand_layout)

# Create a Dash app
app = dash.Dash(__name__)

# Create layout for the dashboard: a centred page heading followed by one
# dcc.Graph per EDA figure built above, all inside a padded container.
app.layout = html.Div([
    # Dash style dictionaries take camelCased CSS property names, so the
    # heading alignment is spelled 'textAlign' rather than 'text-align'.
    html.H1("Flipkart Ecommerce Dashboard", style={'textAlign': 'center'}),
    dcc.Graph(id='rating-distribution', figure=rating_distribution_fig),
    dcc.Graph(id='category-distribution', figure=category_distribution_fig),
    dcc.Graph(id='retail-vs-discounted', figure=retail_vs_discounted_fig),
    dcc.Graph(id='advantage-distribution', figure=advantage_distribution_fig),
    dcc.Graph(id='crawl-trend', figure=crawl_trend_fig),
    dcc.Graph(id='top-brands', figure=top_brands_fig),
], style={'padding': '20px'})

# Run the app
# Start the Dash server on port 8052 with debug mode enabled, but only when
# this file is executed as a script (not when it is imported).
if __name__ == '__main__':
    _server_options = {'port': 8052, 'debug': True}
    app.run_server(**_server_options)

