In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("InstagramAnalysis").getOrCreate()

In [3]:
df = spark.read.csv("/content/drive/MyDrive/task data set/Instagram data.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- Impressions: integer (nullable = true)
 |-- From Home: integer (nullable = true)
 |-- From Hashtags: integer (nullable = true)
 |-- From Explore: integer (nullable = true)
 |-- From Other: integer (nullable = true)
 |-- Saves: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Shares: integer (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Profile Visits: integer (nullable = true)
 |-- Follows: integer (nullable = true)
 |-- Caption: string (nullable = true)
 |-- Hashtags: string (nullable = true)

+-----------+---------+-------------+------------+----------+-----+--------+------+-----+--------------+-------+--------------------+--------------------+
|Impressions|From Home|From Hashtags|From Explore|From Other|Saves|Comments|Shares|Likes|Profile Visits|Follows|             Caption|            Hashtags|
+-----------+---------+-------------+------------+----------+-----+--------+------+-----+--------------+-------+--------------------+---------

In [4]:
# Show summary statistics
df.describe().show()

# Count of nulls per column
from pyspark.sql.functions import col, isnan, when, count

df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+
|summary|      Impressions|         From Home|     From Hashtags|      From Explore|       From Other|             Saves|         Comments|           Shares|             Likes|    Profile Visits|           Follows|             Caption|            Hashtags|
+-------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+
|  count|              119|               119|               119|               119|              119|               119|              119|              119|               119|               119|               119|               

In [5]:
df.orderBy(df.Likes.desc()).select("Caption", "Likes", "Impressions").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----------+
|Caption                                                                                                                                                                                                                                                                                                                       |Likes|Impressions|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--

In [6]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

vec_assembler = VectorAssembler(inputCols=["Likes", "Follows"], outputCol="features")
df_vector = vec_assembler.transform(df).select("features")

correlation = Correlation.corr(df_vector, "features").head()[0]
print(f"Correlation Matrix:\n{correlation}")


Correlation Matrix:
DenseMatrix([[1.        , 0.74633317],
             [0.74633317, 1.        ]])


In [7]:
df.select("Impressions", "Likes", "Shares", "Follows", "Profile Visits").groupBy().avg().show()


+-----------------+------------------+-----------------+------------------+-------------------+
| avg(Impressions)|        avg(Likes)|      avg(Shares)|      avg(Follows)|avg(Profile Visits)|
+-----------------+------------------+-----------------+------------------+-------------------+
|5703.991596638655|173.78151260504202|9.361344537815127|20.756302521008404| 50.621848739495796|
+-----------------+------------------+-----------------+------------------+-------------------+



In [8]:
from pyspark.sql.functions import explode, split, lower, trim

hashtags_split = df.withColumn("Hashtag", explode(split(col("Hashtags"), "#")))
hashtags_clean = hashtags_split.withColumn("Hashtag", trim(lower(col("Hashtag"))))
hashtags_clean.groupBy("Hashtag").count().orderBy("count", ascending=False).show(10, truncate=False)

+-----------------------+-----+
|Hashtag                |count|
+-----------------------+-----+
|                       |119  |
|python�                |109  |
|amankharwal�           |107  |
|machinelearning�       |96   |
|pythonprogramming�     |95   |
|datascience�           |94   |
|ai�                    |91   |
|artificialintelligence�|89   |
|data�                  |88   |
|dataanalytics�         |87   |
+-----------------------+-----+
only showing top 10 rows



In [9]:
df_summary = df.select("Caption", "Likes", "Impressions")
df_summary.write.csv("insights_output", header=True)
