test

# 1 task

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import t

# Manually input the dataset
data = {
    "Case": list(range(1, 17)),
    "Stratum": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
    "Cluster": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8],
    "Variable": [11.5, 29, 35.6, 64.7, 19.2, 20.5, 37.1, 92, 88.3, 78.4, 65.3, 55.2, 85.3, 15.7, 44.5, 67.3]
}

df = pd.DataFrame(data)

# Extract relevant column
variable_values = df['Variable']
n = len(variable_values)

# 1) Compute mean for Simple Random Sampling (SRS)
mean_srs = np.round(np.mean(variable_values), 2)

# 2) Compute standard error for SRS
std_dev = np.std(variable_values, ddof=1)  # Sample standard deviation
se_srs = np.round(std_dev / np.sqrt(n), 2)

# 3) Compute 95% confidence interval
confidence_level = 0.95
t_value = 2.04  # Given in the question
margin_of_error = np.round(t_value * se_srs, 2)
upper_limit_srs = np.round(mean_srs + margin_of_error, 2)
lower_limit_srs = np.round(mean_srs - margin_of_error, 2)

# Clustering Random Sampling
cluster_means = df.groupby('Cluster')['Variable'].mean()
cluster_sample_size = df.groupby('Cluster')['Variable'].count()

# 4) Compute mean for Clustering Random Sampling
mean_cluster = np.round(cluster_means.mean(), 2)

# 5) Compute standard error for Clustering Random Sampling
se_cluster = np.round(np.std(cluster_means, ddof=1) / np.sqrt(len(cluster_means)), 2)

# 6) Compute d-value
d_value = np.round(se_cluster / se_srs, 2)

# 7) Compute d-squared
d_squared = np.round(d_value ** 2, 2)

# 8) Compute roh
roh = np.round((d_squared - (1 / n)) / (1 - (1 / n)), 2)

# 9) Compute effective sample size (N_eff)
W_d = 0.125  # Given in the hint
S_c = 8  # Number of clusters
N_eff = np.round(n / (1 + (n - 1) * W_d), 2)

# Display results
results = {
    "Mean (SRS)": mean_srs,
    "Standard Error (SRS)": se_srs,
    "95% CI Upper (SRS)": upper_limit_srs,
    "95% CI Lower (SRS)": lower_limit_srs,
    "Mean (Clustering)": mean_cluster,
    "Standard Error (Clustering)": se_cluster,
    "d-value": d_value,
    "d-squared": d_squared,
    "roh": roh,
    "Effective Sample Size (N_eff)": N_eff
}

for key, value in results.items():
    print(f"{key}: {value}")


Mean (SRS): 50.6
Standard Error (SRS): 6.89
95% CI Upper (SRS): 64.66
95% CI Lower (SRS): 36.54
Mean (Clustering): 50.6
Standard Error (Clustering): 7.62
d-value: 1.11
d-squared: 1.23
roh: 1.25
Effective Sample Size (N_eff): 5.57


# 2 task

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

# Upload the file
uploaded = files.upload()

# Load dataset (assuming the first uploaded file is the dataset)
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

# Extract features and target variable
X = df[['X1', 'X2', 'X1^2', 'X1^3', 'X2^2', 'X2^3', 'X1*X2', 'X1^2*X2']].values
Y = df[['Y']].values

# Feature normalization (Z-score normalization)
def feature_normalize(X):
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    X_norm = (X - mu) / sigma
    return X_norm, mu, sigma

X, mu, sigma = feature_normalize(X)

# Add bias term
X = np.c_[np.ones(X.shape[0]), X]

# Initialize theta parameters to zero
theta = np.zeros((X.shape[1], 1))

# Gradient Descent Function
def compute_cost(X, Y, theta):
    m = len(Y)
    predictions = X @ theta
    cost = (1 / (2 * m)) * np.sum((predictions - Y) ** 2)
    return cost

def gradient_descent(X, Y, theta, alpha, iterations):
    m = len(Y)
    cost_history = []

    for _ in range(iterations):
        theta -= (alpha / m) * (X.T @ (X @ theta - Y))
        cost_history.append(compute_cost(X, Y, theta))

    return theta, cost_history

# Training the model with different iterations
iterations_list = [10, 100, 1000]
learning_rate = 0.1

results = []
for n in iterations_list:
    theta_opt, cost_history = gradient_descent(X, Y, np.zeros((X.shape[1], 1)), learning_rate, n)
    max_theta = np.max(np.abs(theta_opt))
    final_cost = round(cost_history[-1])
    results.append((n, final_cost, round(max_theta)))

# Display results
df_results = pd.DataFrame(results, columns=['Iterations', 'Cost Function', 'Max Theta'])
print(df_results)


Saving Question2_Dataset.csv to Question2_Dataset.csv
   Iterations  Cost Function  Max Theta
0          10         895241       2167
1         100          42271       3328
2        1000           1261       3328


# 3 task

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from google.colab import files
import io

# Upload file in Colab
uploaded = files.upload()
file_name = list(uploaded.keys())[0]  # Get the uploaded file name

# Load dataset
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Separate input and output features
X = df.iloc[:, :-1].values  # Input features
y = df.iloc[:, -1].values   # Output feature

# Apply normalization
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Function to train logistic regression and compute cost function
def train_logistic_regression(X, y, N, alpha, lambda_):
    model = LogisticRegression(penalty='l2', C=1/lambda_, solver='lbfgs', max_iter=N)
    model.fit(X, y)

    # Compute cost function (log loss)
    from sklearn.metrics import log_loss
    y_pred_proba = model.predict_proba(X)[:, 1]
    cost = log_loss(y, y_pred_proba)

    # Get optimal theta parameters
    theta = np.hstack((model.intercept_, model.coef_.flatten()))
    max_theta = np.round(np.max(theta), 2)

    return np.round(cost, 2), theta, max_theta

# Run iterations
iterations = [(100, 0.1, 0.1), (1000, 0.2, 1), (10000, 0.3, 10)]
results = {}

for i, (N, alpha, lambda_) in enumerate(iterations, start=1):
    cost, theta, max_theta = train_logistic_regression(X_normalized, y, N, alpha, lambda_)
    results[f"Iteration {i}"] = {"Cost": cost, "Max Theta": max_theta}

# Run prediction for final iteration
final_model = LogisticRegression(penalty='l2', C=1/10, solver='lbfgs', max_iter=10000)
final_model.fit(X_normalized, y)
predictions = (final_model.predict_proba(X_normalized)[:, 1] >= 0.5).astype(int)
num_ones = np.sum(predictions[:10])

# Print results
for key, value in results.items():
    print(f"{key}: Cost={value['Cost']}, Max Theta={value['Max Theta']}")
print(f"Number of ones in first 10 rows: {num_ones}")


Saving Question3_Final_CP.csv to Question3_Final_CP.csv
Iteration 1: Cost=0.04, Max Theta=5.4
Iteration 2: Cost=0.11, Max Theta=2.45
Iteration 3: Cost=0.24, Max Theta=1.0
Number of ones in first 10 rows: 6


# 4 task

a4 = [0.950, 0.050]

a3.min() = 0.050

W4.max() = 0.45

W3.min() = 0.18

Loss after 10000 epochs = 0.050

NNN cannot define class


# 5 task

In [7]:
import numpy as np

# Given confusion matrix
conf_matrix = np.array([
    [30, 20, 10],  # Predicted as a
    [50, 60, 10],  # Predicted as b
    [20, 20, 80]   # Predicted as c
])

# Total number of samples
total_samples = np.sum(conf_matrix)

# Correct predictions (diagonal elements)
correct_predictions = np.trace(conf_matrix)

# Accuracy calculation
accuracy = correct_predictions / total_samples

# Precision, Recall, and F1-score calculations
precision = np.diag(conf_matrix) / np.sum(conf_matrix, axis=0)
recall = np.diag(conf_matrix) / np.sum(conf_matrix, axis=1)
f1_score = 2 * (precision * recall) / (precision + recall)

# Round results to 3 decimal places
accuracy = round(accuracy, 3)
precision = np.round(precision, 3)
recall = np.round(recall, 3)
f1_score = np.round(f1_score, 3)

# Display results
print(f"Accuracy: {accuracy}")
print(f"Precision for classes a, b, c: {precision}")
print(f"Recall for classes a, b, c: {recall}")
print(f"F1-score for classes a, b, c: {f1_score}")


Accuracy: 0.567
Precision for classes a, b, c: [0.3 0.6 0.8]
Recall for classes a, b, c: [0.5   0.5   0.667]
F1-score for classes a, b, c: [0.375 0.545 0.727]


# 6 task

A: Borrowing and Returning Books

B: Fine Management

C: Reports and Analytics

D: Performance Requirements

E: Security Requirements

F: Usability Requirements


# 7 task

In [11]:
import pandas as pd
import numpy as np
from google.colab import files

# Step 1: Upload CSV file
print("Please upload your CSV file:")
uploaded = files.upload()

# Get the uploaded file name
file_name = list(uploaded.keys())[0]

# Load the dataset
df = pd.read_csv(file_name)

# Ensure the required column exists
if "Age Group" not in df.columns:
    raise ValueError("Error: Column 'Age Group' not found in CSV file.")

# Compute counts for the "Age Group" category
category_counts = df["Age Group"].value_counts()
total = category_counts.sum()

# Compute proportions
summary_df = pd.DataFrame({
    "Category": category_counts.index,
    "Count": category_counts.values,
    "Proportion": (category_counts / total).round(3)
})

# Compute Standard Error (SE)
summary_df['Standard Error'] = (np.sqrt((summary_df['Proportion'] * (1 - summary_df['Proportion'])) / total)).round(3)

# Apply Design Effect (DEFF)
rho = 0.02  # Intraclass correlation coefficient
m = 5  # Average cluster size
deff_clustering = 1 + rho * (m - 1)

# Compute Adjusted SE (Clustering)
summary_df['Adjusted SE (Clustering)'] = (summary_df['Standard Error'] * np.sqrt(deff_clustering)).round(3)

# Compute 95% Confidence Intervals (CI)
z_value = 1.96  # Z-score for 95% confidence level
summary_df['Lower CI (Simple)'] = (summary_df['Proportion'] - z_value * summary_df['Standard Error']).round(3)
summary_df['Upper CI (Simple)'] = (summary_df['Proportion'] + z_value * summary_df['Standard Error']).round(3)
summary_df['Lower CI (Clustering)'] = (summary_df['Proportion'] - z_value * summary_df['Adjusted SE (Clustering)']).round(3)
summary_df['Upper CI (Clustering)'] = (summary_df['Proportion'] + z_value * summary_df['Adjusted SE (Clustering)']).round(3)
summary_df['DEFF (Clustering)'] = round(deff_clustering, 3)

# Filter results for age groups "18-24" and "55-64"
filtered_summary_df = summary_df[summary_df["Category"].isin(["18-24", "55-64"])]

# Display results
print("\nFiltered Age Group Analysis:")
print(filtered_summary_df)


Please upload your CSV file:


Saving Question7_Final_CP.csv to Question7_Final_CP (1).csv

Filtered Age Group Analysis:
          Category  Count  Proportion  Standard Error  \
Age Group                                               
18-24        18-24    227       0.151           0.009   
55-64        55-64    212       0.141           0.009   

           Adjusted SE (Clustering)  Lower CI (Simple)  Upper CI (Simple)  \
Age Group                                                                   
18-24                         0.009              0.133              0.169   
55-64                         0.009              0.123              0.159   

           Lower CI (Clustering)  Upper CI (Clustering)  DEFF (Clustering)  
Age Group                                                                   
18-24                      0.133                  0.169               1.08  
55-64                      0.123                  0.159               1.08  


# 8 task

1. **`private String strCustomerName;`** → **Hungarian**  
2. **`public Customer(int customerlD, String customerName, String email)`** → **Pascal**  
3. **`public String getEmail() { return strEmail; }`** → **Hungarian**  
4. **`private String strSKU;`** → **Acronym**  
5. **`private Product prodltem;`** → **Hungarian**  
6. **`public void AddOrderltem(Orderltem orderltem)`** → **Pascal**  
7. **`private String strOTP;`** → **Acronym**  

# 9 task

A: Stability Testing

B: Performance Testing

class TestOrder: Stress Testing

test_order_under_repeated_processing: Stress Testing

# 10 task

A: 0k Case

B: Missing Semicolon

C: Incorrect Comparison Index

D: Index Out of Range Bug

E: Undefined Variable

F: Incorrect Loop Condition

G: Missing Semicolon

H: 0k Case