In [None]:
#Pyspark

pip install pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Step 1: Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("LogisticRegressionExample")
    .getOrCreate()
)

# Step 2: Read the CSV; the first row holds column names and types are inferred
data = spark.read.csv(
    "C:/Users/RITHVI/Desktop/pred_final/Log_Reg_dataset.csv",
    header=True,
    inferSchema=True,
)

# Step 3: Descriptive statistics
data.describe().show()

# Step 4: Cast the 'Status' label column to double
data = data.withColumn("Status", data["Status"].cast("double"))

# Step 5: Encode each categorical column as a numeric "<name>_index" column.
# handleInvalid="keep" is passed so unseen/invalid values do not abort the transform.
categorical_cols = ["Country", "Platform", "Repeat_Visitor"]
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    for name in categorical_cols
]

# Fit each indexer on the current frame and append its output column
for indexer in indexers:
    indexer_model = indexer.fit(data)
    data = indexer_model.transform(data)

# Step 6: Prepare data for logistic regression
label_col = "Status"
feature_cols = [
    "Country_index",
    "Age",
    "Repeat_Visitor_index",
    "Platform_index",
    "Web_pages_viewed",
]

# Combine the feature columns into the single "features" vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Step 7: 80/20 train/test split with a fixed seed
training_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 8: Build and fit the logistic regression model
lr = LogisticRegression(labelCol=label_col)
lr_model = lr.fit(training_data)

# Step 9: Score the held-out rows
predictions = lr_model.transform(test_data)

# Confusion matrix: row counts per (actual, predicted) pair
print("Confusion Matrix:")
conf_matrix = predictions.groupBy("Status", "prediction").count()
conf_matrix.show()

# Precision, recall, and F1 score
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Collect the (Status, prediction) counts with a single aggregation instead of
# running four separate filter + count jobs over `predictions`.
# Row fields are read with item access: `row.count` would resolve to the tuple method.
outcome_counts = {
    (row["Status"], row["prediction"]): row["count"]
    for row in predictions.groupBy("Status", "prediction").count().collect()
}

# Missing combinations (e.g. no false positives at all) count as zero.
TP = outcome_counts.get((1.0, 1.0), 0)
TN = outcome_counts.get((0.0, 0.0), 0)
FP = outcome_counts.get((0.0, 1.0), 0)
FN = outcome_counts.get((1.0, 0.0), 0)

# Compute classification metrics; each ratio falls back to 0.0 rather than raising
# ZeroDivisionError when its denominator is empty (e.g. the model never predicts 1).
accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
precision = _safe_ratio(TP, TP + FP)
recall = _safe_ratio(TP, TP + FN)
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

In [None]:
#Superstore

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error


# Load data
df = pd.read_excel('/content/Superstore.xls')

# Take an explicit copy of the filtered rows so the date conversion below writes to
# an independent frame instead of a slice of `df` (avoids SettingWithCopyWarning).
office_supplies = df[df['Category'] == 'Office Supplies'].copy()
office_supplies['Order Date'] = pd.to_datetime(office_supplies['Order Date'])

# Monthly sales: select 'Sales' before summing so that only that column is aggregated;
# summing every column first would also try to add up date and text columns.
monthly_sales = office_supplies.groupby(pd.Grouper(key='Order Date', freq='M'))['Sales'].sum()

# a. Stationarity check: overlay the series with its 12-period rolling mean and std
rolling_window = monthly_sales.rolling(window=12)
for series, series_label in (
    (monthly_sales, 'Original'),
    (rolling_window.mean(), 'Rolling Mean'),
    (rolling_window.std(), 'Rolling Std'),
):
    plt.plot(series, label=series_label)
plt.title('Rolling Mean & Standard Deviation')
plt.legend()
plt.show()

# Augmented Dickey-Fuller test; positions 0, 1 and 4 of the result are reported below
result = adfuller(monthly_sales)
adf_statistic, adf_p_value, adf_critical_values = result[0], result[1], result[4]
print('ADF Statistic (a):', adf_statistic)
print('p-value (a):', adf_p_value)
print('Critical Values (a):', adf_critical_values)

# b. Determine order of differencing, d
d = 1

# Plot the monthly series on an explicit axes object
fig, ax = plt.subplots()
ax.plot(monthly_sales)
ax.set(title='Monthly Sales', xlabel='Order Date', ylabel='Sales')
plt.show()

# c. Determine order p for AR(p)
# statsmodels only computes partial autocorrelations for lags strictly below half the
# sample size, so the lag count is capped accordingly (and at 24 for longer series).
max_pacf_lags = min(24, len(monthly_sales) // 2 - 1)

# Draw onto an explicit axes: without `ax`, plot_pacf opens its own figure and the
# sized figure created beforehand would be left empty.
fig, ax = plt.subplots(figsize=(10, 6))
plot_pacf(monthly_sales, lags=max_pacf_lags, alpha=0.05, ax=ax)
ax.set_title('Partial Autocorrelation Function (PACF)')
plt.show()

# Assuming p = 2
p = 2

# d. Determine order q for MA(q)
# Cap the lag count so it never exceeds what the series length can support.
max_acf_lags = min(30, len(monthly_sales) - 1)

# Draw onto an explicit axes: without `ax`, plot_acf opens its own figure and the
# sized figure created beforehand would be left empty.
fig, ax = plt.subplots(figsize=(10, 6))
plot_acf(monthly_sales, lags=max_acf_lags, alpha=0.05, ax=ax)
ax.set_title('Autocorrelation Function (ACF)')
plt.show()

# Assuming q = 2
q = 2

# e. Fit ARIMA model and forecast
# Hold out the last periods of the series so the forecast is compared with observations
# from the same dates. Forecasting len(monthly_sales) steps ahead and scoring those
# future values against the historical series would compare unrelated time periods.
holdout_periods = max(1, min(12, len(monthly_sales) // 4))
train_sales = monthly_sales.iloc[:-holdout_periods]
test_sales = monthly_sales.iloc[-holdout_periods:]

model = ARIMA(train_sales, order=(p, d, q))
model_fit = model.fit()

# Forecast exactly the held-out horizon
forecast = model_fit.forecast(steps=len(test_sales))

# Plot original data and forecast (forecast drawn over the held-out dates)
plt.plot(monthly_sales, label='Original')
plt.plot(test_sales.index, forecast, label='Forecast', linestyle='--')
plt.legend()
plt.title('ARIMA Forecast')
plt.show()

# Evaluate model accuracy on the held-out periods only
mse = mean_squared_error(test_sales, forecast)
print('Mean Squared Error (MSE) (e):', mse)

In [None]:
#Spam
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the spam dataset, keep the two used columns and give them descriptive names
df = (
    pd.read_csv("C:/Users/RITHVI/Desktop/pred_final/Spam.csv", encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# `df` (columns 'label' and 'message') is already loaded at the top of this cell,
# so the CSV is not read a second time here.
from sklearn.model_selection import train_test_split

y = df['label']

# Hold out 20% of the messages so the reported metrics come from data the model has
# not been trained on; stratify to keep the label proportions equal in both parts.
messages_train, messages_test, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.2, random_state=42, stratify=y
)

# Learn the vocabulary from the training messages only, then reuse it for the test set
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(messages_train)
x_test = vectorizer.transform(messages_test)

clf = MultinomialNB()
clf.fit(x_train, y_train)


def evaluate_classifier(clf, x_test, y_test, pos_label='spam'):
    """Score a fitted classifier on the given features and labels.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    x_test : feature matrix passed to ``clf.predict``.
    y_test : true labels compared against the predictions.
    pos_label : label treated as the positive class for precision, recall and F1
        (defaults to ``'spam'``).

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)
    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    return accuracy, precision, recall, f1


# Report metrics on the held-out messages
accuracy, precision, recall, f1 = evaluate_classifier(clf, x_test, y_test)
print(f"Accuracy: {accuracy:.2f}")
print(f"precision: {precision:.2f}")
print(f"recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
#India exchange rate
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

data = pd.read_excel("C:/Users/RITHVI/Downloads/India_Exchange_Rate_Dataset.xls")

# Series analysed throughout this cell
exchange_rate = data['EXINUS']

# Raw series, 12-period simple moving average, and 12-span exponentially weighted mean,
# each drawn in its own figure
rate_views = [
    (exchange_rate, 'India Exchange Rate'),
    (exchange_rate.rolling(window=12).mean(), 'India Exchange Rate with 12-month SMA'),
    (exchange_rate.ewm(span=12).mean(), 'India Exchange Rate with 12-month EWMA'),
]
for series, chart_title in rate_views:
    series.plot(figsize=(12, 6), title=chart_title)
    plt.show()

# Augmented Dickey-Fuller test: report the statistic and its p-value
result = adfuller(exchange_rate)
adf_statistic, adf_p_value = result[0], result[1]
print("Adfuller statistics:", adf_statistic)
print("p-value:", adf_p_value)

# ACF (top panel) and PACF (bottom panel) for the first 30 lags
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
plot_acf(exchange_rate, lags=30, ax=ax1)
plot_pacf(exchange_rate, lags=30, ax=ax2)
plt.show()

In [None]:
#r programming
#2
# Install only the packages that are not available yet, so re-running this cell
# does not download and reinstall everything each time.
required_packages <- c("readr", "dplyr", "Hmisc", "ggplot2", "datarium", "caret")
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(readr)    # reading a csv file
library(dplyr)    # data wrangling
library(Hmisc)    # data description
library(ggplot2)  # data visualization
library(datarium) # dataset
library(caret)    # machine learning library for splitting into training and test sets

# Load the marketing dataset shipped with datarium and keep a working copy
data("marketing", package = "datarium")
marketing_plan <- marketing

# Preview the first rows instead of printing the whole data frame
head(marketing_plan)

#1
# Scatter plot of sales against one advertising channel, with a smoothed trend line
# (no confidence band) on a white panel. `mapping` is the aes() for the plot.
sales_scatter <- function(mapping, x_label, plot_title) {
  ggplot(marketing_plan, mapping) +
    geom_point() +
    labs(x = x_label, y = "Sales", title = plot_title) +
    stat_smooth(se = FALSE) +
    theme(panel.background = element_rect(fill = "white", colour = "grey50"))
}

sales_scatter(aes(x = youtube, y = sales),
              "Spending on YouTube ads",
              "Graph 1: Relationship between YouTube ads and sales")
sales_scatter(aes(x = facebook, y = sales),
              "facebook",
              "Graph 2: Relationship between facebook and sales")
sales_scatter(aes(x = newspaper, y = sales),
              "newspaper",
              "Graph 3: Relationship between newspaper and sales")

#4
# Fix the RNG seed so the partition is reproducible
set.seed(1)

# Row indices for the training partition (p = 0.8), drawn on the sales column
train_indices <- createDataPartition(y = marketing$sales, p = 0.8, list = FALSE)

# Training rows are the selected indices; test rows are everything else
train_listings <- marketing[train_indices, ]
test_listings <- marketing[-train_indices, ]

#3
#observation
#We can see that the p-values for youtube and facebook are extremely small,
#which means that we reject the null hypothesis that youtube and facebook do not impact sales.
#On the other hand, the p-value for newspaper is greater than 0.05,
#which means it is not statistically significant: we fail to reject the null hypothesis
#that there is no relationship between newspaper ads and sales.
#I will therefore create a second model that excludes the variable newspaper.

#5
# All models below are fitted on the training partition and summarised in turn.

# case 1: youtube, facebook and newspaper versus sales
model_0 <- lm(sales ~ youtube + facebook + newspaper, data = train_listings)
summary(model_0)

# youtube and facebook only (newspaper dropped)
model_1 <- lm(sales ~ youtube + facebook, data = train_listings)
summary(model_1)

# adds squared terms for facebook and youtube
model_2 <- lm(sales ~ facebook + I(facebook^2) + youtube + I(youtube^2), data = train_listings)
summary(model_2)

# facebook plus a degree-5 polynomial in youtube
model_3 <- lm(sales ~ facebook + poly(youtube, 5), data = train_listings)
summary(model_3)

# facebook, a degree-3 polynomial in youtube, and the facebook-by-youtube interaction
model_4 <- lm(sales ~ facebook + poly(youtube, 3) + facebook * youtube, data = train_listings)
summary(model_4)
