# Telecom Dataset Statistical Analysis
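This notebook performs exploratory data analysis and statistical summaries for the telecom dataset, then fits a linear regression model for price, runs a hypothesis test on Bluetooth, and projects the features with PCA.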

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Load the dataset

In [None]:
# Location of the raw CSV, resolved relative to the notebook's working directory
DATA_PATH = 'Telecom Dataset.csv'

# Read the file into the frame used by every later cell, then preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

## Dataset Overview

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes)
df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones (include='all')
df.describe(include='all')

## Check for Missing Values

In [None]:
# Number of missing entries in each column
df.isna().sum()

## Visualizing Feature Distributions

In [None]:
# Encode the yes/no flag columns as 1/0 so they can be plotted and correlated.
yes_no_columns = ['Blue', 'Wi_Fi', 'Tch_Scr', 'Ext_Mem']

# `replace` alone leaves the numeric dtype to pandas' implicit object->int
# downcasting, which is deprecated (pandas 2.2) and, once removed, would leave
# these columns as object dtype (and therefore skipped by `df.hist`).
# Converting explicitly with `pd.to_numeric` guarantees a numeric dtype and
# raises immediately if a column holds an unexpected label (e.g. 'Yes').
# The cell stays safe to re-run: already-encoded 1/0 values pass through as-is.
df[yes_no_columns] = (
    df[yes_no_columns]
    .replace({'yes': 1, 'no': 0})
    .apply(pd.to_numeric)
)

# One histogram panel per numeric column, 20 bins each
hist_axes = df.hist(bins=20, figsize=(15, 12))

# Tighten spacing on the figure that `df.hist` just created, then render it
plt.gcf().tight_layout()
plt.show()

## Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns.
# `numeric_only=True` is required on pandas >= 2.0, where `corr()` no longer
# drops non-numeric columns silently and raises if any text column remains.
corr_matrix = df.corr(numeric_only=True)

# Annotated heatmap; explicit figure/axes instead of the pyplot state machine
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Price vs. Features

In [None]:
# Columns to plot against price, one scatter figure per column
features = ['RAM', 'Bty_Pwr', 'Int_Mem', 'Px_h', 'Px_w', 'Weight']
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x=feature, y='Price', ax=ax)
    ax.set_title(f'Price vs {feature}')
    plt.show()

## Part 2: Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Predictor columns and the target used by the price model
features = ['RAM', 'Bty_Pwr', 'Int_Mem', 'Px_h', 'Px_w', 'Weight']
X, y = df[features], df['Price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training portion only
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Report the fitted parameters followed by the test-set error metrics
for label, value in (
    ("Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
    ("MSE:", mean_squared_error(y_test, y_pred)),
    ("R^2 Score:", r2_score(y_test, y_pred)),
):
    print(label, value)

## Part 3: Hypothesis Testing

In [None]:
import scipy.stats as stats

# T-test: Does Bluetooth (Blue) impact price?
# Relies on 'Blue' already being encoded as 1/0 by the yes/no conversion cell.
# Missing prices are dropped first: with the default NaN handling a single NaN
# in either group makes the statistic and p-value NaN.
group1 = df.loc[df['Blue'] == 1, 'Price'].dropna()
group0 = df.loc[df['Blue'] == 0, 'Price'].dropna()

if min(len(group1), len(group0)) < 2:
    # The test needs at least two observations per group to estimate variance
    t_stat, p_value = np.nan, np.nan
else:
    t_stat, p_value = stats.ttest_ind(group1, group0)

print(f"T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}")
if np.isnan(p_value):
    # A NaN p-value is not evidence of "no difference": `NaN < 0.05` is False,
    # so without this branch an uncomputable test would be misreported.
    print("Result: Test could not be computed (too few observations or zero variance)")
elif p_value < 0.05:
    print("Result: Significant difference in price based on Bluetooth")
else:
    print("Result: No significant difference")

## Part 4: Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Rescale the predictors before PCA so no column dominates purely through its units
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce the scaled predictors to their first two principal components
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# Scatter of the two components, coloured by the price of each row
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    components[:, 0],
    components[:, 1],
    c=y,
    cmap='viridis',
    edgecolor='k',
)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA Projection of Telecom Data')
fig.colorbar(points, ax=ax, label='Price')
plt.show()

## Part 5: Optimization Insight

In [None]:
# Rank rows from the highest price down, then show the model's feature columns
# for the first five rows (the default size of `head()`)
price_ranked = df.sort_values(by='Price', ascending=False)
top_features = price_ranked.head()[features]
top_features