In [1]:
# Install PySpark (needed on Google Colab)
# !pip install pyspark

# Import Pandas and PySpark
import pandas as pd
from pyspark import SparkFiles
from pyspark.sql import SparkSession




In [None]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName("Beginner_Data_Analysis")
    .getOrCreate()
)
print("Spark Ready ✅")


In [None]:
# Base URL of the GitHub repository that serves the raw CSV files
base = "https://raw.githubusercontent.com/theabhinaykumar/csv/main/"

# Links to every dataset file (spaces in file names are already encoded as %20)
user_reviews = f"{base}user_reviews.csv"
apps = f"{base}apps.csv"
db1_sales = f"{base}DB1%20sales.csv"
hospital = f"{base}hospital.csv"
ab_nyc_2019 = f"{base}AB_NYC_2019.csv"
creditcard = f"{base}creditcard.csv"
creditcard_1 = f"{base}creditcard%201.csv"

# Show one resolved link as a sanity check
print(user_reviews)


In [None]:
# Load the CSV into a pandas DataFrame
df = pd.read_csv(user_reviews)

# Look at the first 5 rows
print(df.head(5))

# Dataset dimensions
print("Rows:", len(df))
print("Columns:", len(df.columns))

# Column names
print(df.columns)


In [None]:
# Load the CSV into a PySpark DataFrame.
# spark.read.csv() cannot open an https:// URL directly (Spark resolves paths
# through Hadoop filesystems, which have no handler for the https scheme), so
# the file is first fetched with addFile() and then read from its local copy.
spark.sparkContext.addFile(user_reviews)

# addFile() stores the download under the URL's last path segment.
# NOTE(review): for links with percent-encoded names (e.g. "DB1%20sales.csv")
# the stored name is presumably the decoded one — confirm before reusing this
# cell with those links.
csv_name = user_reviews.rsplit("/", 1)[-1]

# SparkFiles.get() returns an absolute driver-side path; the file:// prefix
# makes Spark read it from the local disk rather than the default filesystem.
# NOTE(review): this assumes local mode (e.g. Colab), where driver and
# executors share the same machine.
sdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file://" + SparkFiles.get(csv_name))
)

# Look at the first 5 rows
sdf.show(5)

# Inspect the inferred schema
sdf.printSchema()


In [None]:
# Dataset overview.
# df.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics
print(df.describe())

# Null-value count per column
print(df.isnull().sum())

# Select specific columns
# NOTE(review): assumes the dataset has 'reviewer_id' and 'rating' columns —
# confirm against df.columns before running.
print(df[['reviewer_id', 'rating']].head())

# Filter rows (rating > 4); preview only the first matches instead of
# printing every matching row
print(df[df['rating'] > 4].head())


In [None]:
from pyspark.sql import functions as F

# Total number of rows
print("Rows:", sdf.count())

# Column names
print("Columns:", sdf.columns)

# Summary statistics
sdf.describe().show()

# Null-value count per column: when() produces a non-null marker only for
# null cells, and count() tallies the non-null markers.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in sdf.columns
]
sdf.select(null_count_exprs).show()

# Select specific columns
sdf.select("reviewer_id", "rating").show(5)

# Filter rows (rating > 4)
sdf.where(F.col("rating") > 4).show(5)


In [None]:
# # ============================================================
# # ALL-IN-ONE: Beginner Data Analysis (Pandas + PySpark)
# # Works in Google Colab. Toggle ENGINE and FILE_NAME only.
# # ============================================================

# # --------------------
# # 0) USER SETTINGS
# # --------------------
# ENGINE    = "pandas"     # "pandas" or "pyspark"
# SOURCE    = "github"     # keep "github" for your repo
# FILE_NAME = "user_reviews.csv"   # choose from FILES list below

# # --------------------
# # 1) FILE LIST / URLs
# # --------------------
# from urllib.parse import quote

# GITHUB_USER   = "theabhinaykumar"
# GITHUB_REPO   = "csv"
# GITHUB_BRANCH = "main"

# FILES = [
#     "New York 3,67,000+ Email.csv.xlsx",
#     "apps.csv",
#     "DB1 sales.csv",
#     "hospital.csv",
#     "AB_NYC_2019.csv",
#     "user_reviews.csv",
#     "creditcard.csv",
#     "creditcard 1.csv",
# ]

# def raw_url(user, repo, branch, filename):
#     return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{quote(filename)}"

# URLS = {name: raw_url(GITHUB_USER, GITHUB_REPO, GITHUB_BRANCH, name) for name in FILES}

# # --------------------
# # 2) INSTALL / IMPORT
# # --------------------
# import sys, subprocess, os
# def pip_install(pkgs):
#     subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])

# # Always need pandas
# pip_install(["pandas", "openpyxl"])
# import pandas as pd

# # PySpark only if chosen
# if ENGINE.lower() == "pyspark":
#     pip_install(["pyspark==3.5.1"])
#     from pyspark.sql import SparkSession, functions as F

# print("✅ Engine:", ENGINE)
# print("📄 Using file:", FILE_NAME)
# print("🔗 RAW URL:", URLS.get(FILE_NAME, "(custom)"))

# # --------------------
# # 3) LOAD DATA
# # --------------------
# if ENGINE == "pandas":
#     # CSV vs Excel handled automatically by extension
#     if FILE_NAME.lower().endswith((".xlsx", ".xls")):
#         df = pd.read_excel(URLS[FILE_NAME], engine="openpyxl")
#     else:
#         df = pd.read_csv(URLS[FILE_NAME])
#     print("\n=== LOADED (pandas) ===")
#     print("Shape:", df.shape)
#     print("Columns:", list(df.columns))
#     print(df.head())

# else:  # PySpark
#     spark = (SparkSession.builder.appName("Beginner_All_In_One").getOrCreate())
#     print("\nSpark version:", spark.version)

#     # For Excel in GitHub: simple path = pandas -> spark
#     if FILE_NAME.lower().endswith((".xlsx", ".xls")):
#         pdf = pd.read_excel(URLS[FILE_NAME], engine="openpyxl")
#         sdf = spark.createDataFrame(pdf)
#     else:
#         sdf = (spark.read
#                .option("header", True)
#                .option("inferSchema", True)
#                .csv(URLS[FILE_NAME]))
#     print("\n=== LOADED (pyspark) ===")
#     print("Columns:", sdf.columns)
#     sdf.show(5, truncate=False)

# # --------------------
# # 4) BASIC ANALYSIS
# # --------------------
# if ENGINE == "pandas":
#     print("\n=== BASIC ANALYSIS (pandas) ===")
#     # Info
#     print("\n-- info() --")
#     print(df.info())

#     # Summary (all columns, numeric and non-numeric, because include='all')
#     print("\n-- describe() --")
#     print(df.describe(include='all'))

#     # Null counts
#     print("\n-- null counts --")
#     print(df.isna().sum())

#     # Pick first numeric column (if any) for quick demos
#     num_col = None
#     for c in df.columns:
#         if pd.api.types.is_numeric_dtype(df[c]):
#             num_col = c
#             break

#     # Select first 2-3 columns safely
#     print("\n-- select first columns --")
#     print(df[df.columns[:3]].head())

#     # Filter (if numeric column exists)
#     if num_col:
#         mean_val = df[num_col].mean()
#         print(f"\n-- filter: {num_col} > mean ({mean_val:.3f}) --")
#         print(df[df[num_col] > mean_val].head())

#         # Sort
#         print(f"\n-- sort by {num_col} desc --")
#         print(df.sort_values(by=num_col, ascending=False).head())

#     # Save small outputs (optional)
#     df.head(100).to_csv("sample_output_pandas.csv", index=False)
#     print("\n💾 Saved: sample_output_pandas.csv")

# else:
#     print("\n=== BASIC ANALYSIS (pyspark) ===")
#     # Schema / count
#     print("\n-- schema --")
#     sdf.printSchema()
#     print("\n-- row count --")
#     print(sdf.count())

#     # Summary (numeric)
#     print("\n-- describe() --")
#     sdf.describe().show()

#     # Null counts
#     print("\n-- null counts --")
#     sdf.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in sdf.columns]).show()

#     # Pick first numeric column
#     num_col = None
#     for c, t in sdf.dtypes:
#         if t in ("int","bigint","double","float","long","decimal"):
#             num_col = c
#             break

#     # Select first columns
#     print("\n-- select first columns --")
#     sdf.select(*sdf.columns[:3]).show(5)

#     # Filter & sort (if numeric column exists)
#     if num_col:
#         avg_val = sdf.select(F.avg(F.col(num_col)).alias("avg")).first()["avg"]
#         print(f"\n-- filter: {num_col} > mean ({avg_val}) --")
#         sdf.filter(F.col(num_col) > avg_val).show(5)

#         print(f"\n-- sort by {num_col} desc --")
#         sdf.orderBy(F.col(num_col).desc()).show(5)

#     # Save small outputs (optional)
#     (sdf.limit(100).write.mode("overwrite").option("header", True).csv("sample_output_spark"))
#     print("\n💾 Saved folder: sample_output_spark/")

# # --------------------
# # 5) MINI PRACTICE (DO THESE)
# # --------------------
# print("\n🎯 PRACTICE (Beginner):")
# print("1) Change FILE_NAME to 'apps.csv' then run.")
# print("2) Change ENGINE to 'pyspark' then run.")
# print("3) Find the first numeric column and print its mean.")
# print("4) Filter rows where that numeric column > its mean.")
# print("5) Sort by that numeric column (descending) and show top 5.")
