In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Excel file
file_path = "/content/Copy of television_dataset_(3)(1).xlsx"  # Replace with the actual path to your Excel file
data = pd.read_excel(file_path)

# Display basic information about the dataset
print(data.info())

# Display the first few rows of the dataset
print(data.head())

# Data Cleaning and Conversion
# Example: Handle column name variations or missing columns
# Check and correct column names if needed
if 'RATING' in data.columns:
    # Convert RATING to float (assuming 'RATING' column needs cleaning)
    data['RATING'] = pd.to_numeric(data['RATING'].str.extract(r'(\d+\.\d+)')[0], errors='coerce')

if 'M.R.P.' in data.columns and 'DISCOUNTED PRICE' in data.columns:
    # Convert M.R.P. and DISCOUNTED PRICE to numeric (assuming these columns need cleaning)
    data['M.R.P.'] = pd.to_numeric(data['M.R.P.'].str.replace('[$,]', '', regex=True), errors='coerce')
    data['DISCOUNTED PRICE'] = pd.to_numeric(data['DISCOUNTED PRICE'].str.replace('[$,]', '', regex=True), errors='coerce')

# Drop rows with missing values
data.dropna(inplace=True)

# Basic Data Analysis and Visualization
plt.figure(figsize=(12, 10))

# Example 1: Histogram of Ratings (assuming 'RATING' column exists)
if 'RATING' in data.columns:
    plt.subplot(2, 2, 1)
    sns.histplot(data['RATING'], bins=20, kde=True)
    plt.title('Histogram of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Frequency')
else:
    print("Column 'RATING' not found in the dataset. Unable to plot the histogram of ratings.")

# Example 2: Scatter plot of M.R.P. vs. Discounted Price (assuming 'M.R.P.' and 'DISCOUNTED PRICE' columns exist)
if 'M.R.P.' in data.columns and 'DISCOUNTED PRICE' in data.columns:
    plt.subplot(2, 2, 2)
    sns.scatterplot(x='M.R.P.', y='DISCOUNTED PRICE', data=data)
    plt.title('Scatter plot of M.R.P. vs. Discounted Price')
    plt.xlabel('M.R.P.')
    plt.ylabel('Discounted Price')
else:
    print("Columns 'M.R.P.' or 'DISCOUNTED PRICE' not found in the dataset. Unable to plot the scatter plot.")

# Example 3: Bar chart of Average Ratings by Brand (assuming 'BRAND' column exists)
if 'BRAND' in data.columns and 'RATING' in data.columns:
    plt.subplot(2, 2, 3)
    avg_ratings_by_brand = data.groupby('BRAND')['RATING'].mean().sort_values(ascending=False).head(10)
    avg_ratings_by_brand.plot(kind='bar')
    plt.title('Bar chart of Average Ratings by Brand')
    plt.xlabel('Brand')
    plt.ylabel('Average Rating')
    plt.xticks(rotation=45)
else:
    print("Column 'BRAND' or 'RATING' not found in the dataset. Unable to plot the bar chart of average ratings by brand.")

plt.tight_layout()
plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product name      264 non-null    object 
 1   Unnamed: 1        0 non-null      float64
 2   Unnamed: 2        0 non-null      float64
 3   Unnamed: 3        0 non-null      float64
 4   Unnamed: 4        0 non-null      float64
 5   price             264 non-null    int64  
 6   discount price    264 non-null    int64  
 7   Unnamed: 7        0 non-null      float64
 8   rating            264 non-null    float64
 9   number of rating  264 non-null    object 
dtypes: float64(6), int64(2), object(2)
memory usage: 20.8+ KB
None
                                        product name  Unnamed: 1  Unnamed: 2  \
0  SONY Bravia X75L 108 cm (43 inch) Ultra HD (4K...         NaN         NaN   
1  Mi X Series 108 cm (43 inch) Ultra HD (4K) LED...         NaN         NaN   
2  REDMI 80 cm (32 inch

<Figure size 1200x1000 with 0 Axes>