In [3]:
from IPython.display import HTML, display

# Inject CSS so <pre> output keeps its whitespace and does not wrap
# (presumably to keep wide DataFrame.show() tables aligned -- confirm).
_no_wrap_css = '<style>pre { white-space: pre !important; }</style>'
display(HTML(_no_wrap_css))


In [1]:
# Step 1: Read the CSV file
from pyspark.sql import SparkSession

# Reuse the existing SparkSession if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .appName("Stocks Analysis")
    .getOrCreate()
)

# Load stocks.csv: the first row is the header and column types are inferred
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("stocks.csv")
)

# Step 2: Find out about the schema of data
df.printSchema()

# Step 3: Keep only the rows whose closing price is below 500,
# projected down to the Open / Close / Volume columns
closes_under_500 = df['Close'] < 500
less_than_500_df = (
    df
    .where(closes_under_500)
    .select('Open', 'Close', 'Volume')
)
less_than_500_df.show()

# Step 4: Rows that opened above 200 but closed below 200
opened_above_200 = df['Open'] > 200
closed_below_200 = df['Close'] < 200
more_than_200_less_than_200_df = df.where(opened_above_200 & closed_below_200)
more_than_200_less_than_200_df.show()

# Step 5: Extract the year from the date and save it in a new column
from pyspark.sql.functions import year

df_with_year = df.withColumn('Year', year(df['Date']))

# Step 6: For each year, show the minimum volumes traded
# groupBy() does not guarantee any row order, so sort by Year explicitly to
# get a stable, chronological table every time the cell runs.
min_volume_by_year = (
    df_with_year
    .groupBy('Year')
    .min('Volume')
    .withColumnRenamed('min(Volume)', 'minVolume')
    .orderBy('Year')
)
min_volume_by_year.show()

# Step 7: For each year and month, show the highest low price
# Import Spark's max under an alias: a bare `max` import would replace
# Python's builtin max() for every later statement run in this kernel.
from pyspark.sql.functions import year, month, max as spark_max

df_with_year_month = (
    df
    .withColumn('Year', year(df['Date']))
    .withColumn('Month', month(df['Date']))
)

# Sort the grouped result so the table reads chronologically; groupBy()
# alone returns the groups in no particular order.
max_low_price_by_year_month = (
    df_with_year_month
    .groupBy('Year', 'Month')
    .agg(spark_max('Low').alias('maxLow'))
    .orderBy('Year', 'Month')
)
max_low_price_by_year_month.show()

# Step 8: Calculate mean and standard deviation of high price over the whole data frame
from pyspark.sql.functions import mean, stddev

# Compute both statistics in one aggregation so Spark scans the data once
# instead of running a separate job per statistic.
high_stats = df.agg(
    mean('High').alias('mean_high'),
    stddev('High').alias('stddev_high'),
).first()
mean_high_price = high_stats['mean_high']
stddev_high_price = high_stats['stddev_high']

# An aggregate is None when it has nothing to work on (e.g. no non-null
# 'High' values); round(None) would raise TypeError, so print "N/A" instead.
print("Mean High Price:",
      round(mean_high_price, 2) if mean_high_price is not None else "N/A")
print("Standard Deviation of High Price:",
      round(stddev_high_price, 2) if stddev_high_price is not None else "N/A")

# Stop the SparkSession now that every step above has run.
# NOTE(review): the DataFrames built above (df, df_with_year, ...) were created
# from this session and are presumably unusable after stop() -- re-run from
# Step 1 before reusing them.
spark.stop()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        213.429998|        214.009998|123432400|
|        214.599998|        214.379993|150476200|
|        214.379993|        210.969995|138040000|
|            211.75|            210.58|119282800|
|        210.299994|211.98000499999998|111902700|
|212.79999700000002|210.11000299999998|115557400|
|209.18999499999998|        207.720001|148614900|
|        207.870005|        210.650002|151473000|
|210.11000299999998|            209.43|108223500|
|210.92999500000002|            205.93|148516900|
|        208.330002|        215.039995|182501900|
|        214.910006|            211.7