# Part 1

### Question 1 Answer:

In [None]:
import random
import numpy as np

In [None]:
def tokenize(sentence):
    """Split *sentence* on whitespace and return the tokens in lower case."""
    return [token.lower() for token in sentence.split()]

In [None]:
def build_vocabulary(corpus):
    """Return a dict with one empty successor table per distinct token.

    Every sentence in *corpus* is split with ``tokenize``; each token that
    appears becomes a key whose value is a fresh, empty dict. The tables are
    filled afterwards by ``count_bigrams``.
    """
    vocabulary = {}
    for sentence in corpus:
        for word in tokenize(sentence):
            # Only the first occurrence of a token creates its table.
            vocabulary.setdefault(word, {})
    return vocabulary

In [None]:
def count_bigrams(vocabulary, corpus):
    """Accumulate bigram counts into *vocabulary* in place.

    After the call, ``vocabulary[w][v]`` holds how many times token ``v``
    directly follows token ``w`` across the sentences of *corpus*. Every
    token that has a successor must already be a key of *vocabulary*
    (otherwise a ``KeyError`` is raised). Nothing is returned.
    """
    for sentence in corpus:
        tokens = tokenize(sentence)
        # Walk over adjacent (current, following) token pairs.
        for current, following in zip(tokens, tokens[1:]):
            successors = vocabulary[current]
            successors[following] = successors.get(following, 0) + 1

In [None]:
def generate_sentence_by_probability():
    """Sample one sentence from the bigram counts in the global ``vocabulary``.

    Starting from the ``<|start|>`` token, the next word is drawn repeatedly
    with ``choose_next_word_by_probability`` until ``<|end|>`` is drawn.
    The result is the space-separated sequence including both markers.
    """
    pieces = ["<|start|>"]
    current = choose_next_word_by_probability(vocabulary, "<|start|>")
    while current != "<|end|>":
        pieces.append(current)
        current = choose_next_word_by_probability(vocabulary, current)
    pieces.append("<|end|>")
    return " ".join(pieces)

In [None]:
def choose_next_word_by_probability(vocabulary, word):
    """Draw a successor of *word* at random, weighted by bigram frequency.

    ``vocabulary[word]`` maps each observed successor to its count. The
    counts are normalised to relative frequencies and used as the sampling
    weights for a single draw.
    """
    successors = vocabulary[word]
    total = sum(successors.values())
    candidates = list(successors)
    # Relative frequency of each candidate, in the same order as `candidates`.
    weights = [successors[candidate] / total for candidate in candidates]
    return random.choices(candidates, weights)[0]

In [None]:
# Read the training sentences, one list entry per line of the file
# (line endings are kept, exactly as readlines() would return them).
with open('sentences.txt', 'r') as handle:
    corpus = list(handle)

# One empty successor table per distinct token; counts are added later.
vocabulary = build_vocabulary(corpus)

In [None]:
# Fill the successor tables in place: afterwards vocabulary[w][v] is the
# number of times token v directly follows token w in the corpus.
count_bigrams(vocabulary, corpus)

In [None]:
# Generate five sentences, printing each one as soon as it is produced.
generated = []
for _ in range(5):
    generated.append(generate_sentence_by_probability())
    print(generated[-1])

# Keep the individual names; the corpus-membership check below uses them.
sentence1, sentence2, sentence3, sentence4, sentence5 = generated

<|start|> they enjoy playing the ukulele <|end|>
<|start|> you attend a picnic in the ukulele <|end|>
<|start|> we explore new bakeries <|end|>
<|start|> we take a new coffee shop every saturday <|end|>
<|start|> i practice drawing class every afternoon tea <|end|>


The sentences are not always meaningful. Some of the generated sentences may also coincide with sentences that already appear in the input file.

In [None]:
# Reload the corpus, lower-cased and with trailing whitespace removed, so its
# lines can be compared directly with the generated sentences.
with open('sentences.txt', 'r') as file:
    sentences = [line.lower().rstrip() for line in file.readlines()]

# Report, for each generated sentence, whether it occurs verbatim in the corpus.
membership_checks = [
    ("is sentence 1 in sentences.txt: ", sentence1),
    ("is sentence 2 in sentences.txt: ", sentence2),
    ("is sentence 3 in sentences.txt: ", sentence3),
    ("is sentence 4 in sentences.txt: ", sentence4),
    ("is sentence 5 in sentences.txt: ", sentence5),
]
for label, candidate in membership_checks:
    print(label + str(candidate in sentences))


is sentence 1 in sentences.txt: False
is sentence 2 in sentences.txt: False
is sentence 3 in sentences.txt: False
is sentence 4 in sentences.txt: False
is sentence 5 in sentences.txt: False


### Question 2 Answer:

$$P(w_1, w_2, w_3, \dots, w_k) = P(w_1) \cdot P(w_2 \mid w_1) \cdot P(w_3 \mid w_2) \cdot P(w_4 \mid w_3) \cdots P(w_k \mid w_{k-1})$$

### Question 3 Answer:

In [None]:
# 1) A sentence sampled from the bigram model.
sentence_to_prob = generate_sentence_by_probability()
print(sentence_to_prob)

# 2) A sentence picked uniformly at random from the corpus, lower-cased and
#    with trailing whitespace removed. `random` is already imported at the top
#    of the notebook, so no import is needed inside this cell.
with open('sentences.txt', 'r') as file:
    sentences = [line.lower().rstrip() for line in file]
random_choice = random.choice(sentences)
print(random_choice)

# 3) Edge case: a corpus sentence whose last word was replaced by
#    'galatasaray', a word that does not occur in the file.
random_sentence = "<|start|> You ride your galatasaray <|end|>".lower()

<|start|> they enjoy a community group <|end|>
<|start|> we play sports together <|end|>


In [None]:
def calculate_prob(vocabulary: dict, sentence: str):
    """Return the probability of *sentence* under the bigram counts.

    The sentence is split on whitespace and the probability is the product,
    over adjacent word pairs, of ``count(w, v) / sum of counts following w``.
    The product is accumulated in log space and exponentiated at the end.

    Args:
        vocabulary: mapping ``word -> {next_word: count}``.
        sentence: whitespace-separated tokens; no case folding is applied
            here, so callers pass it already lower-cased.

    Returns:
        ``0`` as soon as a transition cannot occur: a word is not a key of
        *vocabulary*, the current word has no recorded successors, or the
        pair was never observed. Otherwise the probability as a NumPy
        float (``1.0`` for a sentence with fewer than two tokens).
    """
    words = sentence.split()
    log_probability = 0.0
    for current, following in zip(words, words[1:]):
        # Unknown next word: the sentence cannot be generated.
        if following not in vocabulary:
            return 0
        # Unknown current word, or one that was never followed by anything.
        successors = vocabulary.get(current)
        if not successors:
            return 0
        # Both words are known but this pair was never seen together; without
        # this check the lookup would raise KeyError instead of yielding 0.
        count = successors.get(following, 0)
        if not count:
            return 0
        log_probability += np.log(count / sum(successors.values()))
    return np.exp(log_probability)

In [None]:
# Print the bigram probability of, in order:
#   - the sentence sampled from the model,
#   - the sentence taken from the file,
#   - the edge case containing 'galatasaray', a word absent from the file.
for candidate in (sentence_to_prob, random_choice, random_sentence):
    print(calculate_prob(vocabulary, candidate))

3.5744680851063814e-06
0.00015020537525354969
0


Generally, the sentences in the original file are more likely to be generated. The sentences we generated mostly have a lower probability, but occasionally one of them can also have a high probability.

# Part 2

## Task 1

### Question 1 Answer

Exponential distribution with $β = 25$:


*   $μ = 25$
*   $\sigma = 25$
*   $\sigma^2 = 625$



### Question 2 Answer:

## Task 2

### Question 1 Answer:

$P(\text{Detect} \mid a, d) = \frac{P(a, d \mid \text{Detect}) P(\text{Detect})}{P(a, d)} = \frac{P(a \mid \text{Detect})P(d \mid \text{Detect})P(\text{Detect})}{P(a) P(d)} $

In [None]:
import pandas as pd
import numpy as np

In [None]:
def find_likelihood_function(data):
    """Fit a univariate Gaussian to *data* and return its density function.

    The mean and the standard deviation (``np.std`` with its default
    ``ddof``) are estimated from *data*; the returned callable evaluates the
    corresponding normal density at ``x``.
    """
    center = np.mean(data)
    spread = np.std(data)

    def gaussian_pdf(x):
        coefficient = 1 / (spread * np.sqrt(2 * np.pi))
        exponent = -((x - center) ** 2) / (2 * spread ** 2)
        return coefficient * np.exp(exponent)

    return gaussian_pdf

In [None]:
# Posterior class probabilities via Bayes' theorem.
def calculate_posterior(distance, amplitude):
    """Return ``(P(Detect | d, a), P(No Detect | d, a))`` for one observation.

    Each class-conditional joint likelihood is weighted by its class prior;
    the two weighted terms are then divided by their sum, so the returned
    pair adds up to one.
    """
    weighted_detect = prior_detect * joint_likelihood_detect(distance, amplitude)
    weighted_no_detect = prior_no_detect * joint_likelihood_no_detect(distance, amplitude)

    # Normalising constant shared by both posteriors.
    evidence = weighted_detect + weighted_no_detect

    return weighted_detect / evidence, weighted_no_detect / evidence

In [None]:
# Read the labelled data set and the additional data set.
data = pd.read_csv('detection_data.csv')
extra_data = pd.read_csv('detection_data_extra.csv')

# Partition the rows of `data` by the value of the 'Detection' column.
detect_mask = data['Detection'] == "Detect"
no_detect_mask = data['Detection'] == "No Detect"
detection = data[detect_mask]
no_detection = data[no_detect_mask]

# Feature columns: over all rows first, then per class.
amplitude, distance = data['Amplitude'], data['Distance']
amplitude_detect, distance_detect = detection['Amplitude'], detection['Distance']
amplitude_no_detect, distance_no_detect = no_detection['Amplitude'], no_detection['Distance']

# One fitted Gaussian density per feature and per subset.
likelihood_amplitude = find_likelihood_function(amplitude)  # p(a)
likelihood_amplitude_detect = find_likelihood_function(amplitude_detect)  # p(a|detect)
likelihood_amplitude_no_detect = find_likelihood_function(amplitude_no_detect)  # p(a|no detect)

likelihood_distance = find_likelihood_function(distance)  # p(d)
likelihood_distance_detect = find_likelihood_function(distance_detect)  # p(d|detect)
likelihood_distance_no_detect = find_likelihood_function(distance_no_detect)  # p(d|no detect)


# Joint densities, each formed as the product of the two single-feature densities.
def joint_likelihood(distance, amplitude):
    """p(a, d): product of the densities fitted on all rows."""
    return likelihood_distance(distance) * likelihood_amplitude(amplitude)


def joint_likelihood_detect(distance, amplitude):
    """p(a, d | detect): product of the densities fitted on the "Detect" rows."""
    return likelihood_distance_detect(distance) * likelihood_amplitude_detect(amplitude)


def joint_likelihood_no_detect(distance, amplitude):
    """p(a, d | no detect): product of the densities fitted on the "No Detect" rows."""
    return likelihood_distance_no_detect(distance) * likelihood_amplitude_no_detect(amplitude)


# Class priors: share of the rows of `data` carrying each label.
total_rows = len(data)
prior_detect = len(detection) / total_rows
prior_no_detect = len(no_detection) / total_rows

### Question 2 Answer:

In [None]:
# Posterior pair (P(Detect), P(No Detect)) for every row of `data`, i.e. the
# same table the densities and priors were estimated from.
posterior = data.apply(
    lambda row: calculate_posterior(row['Distance'], row['Amplitude']),
    axis=1,
)

# Predict "Detect" only when P(Detect | d, a) is strictly above 0.5.
prediction = posterior.apply(
    lambda pair: "Detect" if pair[0] > 0.5 else "No Detect"
)

# Accuracy: fraction of rows whose prediction equals the recorded label.
actual = data['Detection']
accuracy = np.mean(actual == prediction)
print("Accuracy:", accuracy)

# Show the posterior pair computed for each row.
print(posterior)


Accuracy: 0.9
0       (0.024942321647782987, 0.975057678352217)
1      (0.9975268294224732, 0.002473170577526815)
2     (0.9995446346522848, 0.0004553653477151851)
3       (0.9457115372216279, 0.05428846277837202)
4       (0.8001390857703577, 0.19986091422964225)
                         ...                     
95     (0.006572186505595189, 0.9934278134944049)
96      (0.03863183550371815, 0.9613681644962818)
97     (0.9881241376884649, 0.011875862311535193)
98       (0.32454546747434604, 0.675454532525654)
99       (0.05792915113793698, 0.942070848862063)
Length: 100, dtype: object


### Question 3 Answer:

In [None]:
# Posterior pair (P(Detect), P(No Detect)) for every row of `extra_data`.
posterior = extra_data.apply(
    lambda row: calculate_posterior(row['Distance'], row['Amplitude']),
    axis=1,
)

# Predict "Detect" only when P(Detect | d, a) is strictly above 0.5.
prediction = posterior.apply(
    lambda pair: "Detect" if pair[0] > 0.5 else "No Detect"
)

# Accuracy: fraction of rows whose prediction equals the recorded label.
actual = extra_data['Detection']
accuracy = np.mean(actual == prediction)
print("Accuracy:", accuracy)

# Show the posterior pair computed for each row.
print(posterior)

Accuracy: 0.81
0     (0.0041774234133274335, 0.9958225765866725)
1       (0.28255064142403863, 0.7174493585759614)
2        (0.7992364217293724, 0.2007635782706276)
3     (0.9970778593080465, 0.0029221406919534244)
4        (0.5567479946379642, 0.4432520053620359)
                         ...                     
95     (0.016028412649477117, 0.9839715873505229)
96       (0.0850321404214792, 0.9149678595785208)
97      (0.20423890981812085, 0.7957610901818791)
98     (0.9903716183056595, 0.009628381694340545)
99      (0.015446054947018999, 0.984553945052981)
Length: 100, dtype: object


# Part 3

## Question 1 Answer:

$$E[g(x)] = \int_{-\infty}^\infty g(x) f(x; \mu, \Sigma) \, dx$$


$$g(x) = 0.1x_1^2 + 12.5x_2^2 - 7.5x_3^2
$$

$f(x;μ,Σ)$ is the probability density function of the multivariate normal distribution.

## Question 2 Answer:

In [None]:
import numpy as np

In [None]:
# set seed 42: seed NumPy's global random generator so that the Monte Carlo
# draws made with np.random below are reproducible from run to run.
np.random.seed(42)


In [None]:
# Multivariate normal parameters: mean vector and 3x3 covariance matrix.
mu = np.array([20, 0.3, 0.8])
cov = np.array([
    [4, 0.5, 0.2],
    [0.5, 0.7, 0.1],
    [0.2, 0.1, 0.2],
])

In [None]:
# Integrand g(x) = 0.1*x1^2 + 12.5*x2^2 - 7.5*x3^2.
def g(x):
    """Evaluate g on the first three components of *x*."""
    first_sq, second_sq, third_sq = x[0] ** 2, x[1] ** 2, x[2] ** 2
    return 0.1 * first_sq + 12.5 * second_sq - 7.5 * third_sq

In [None]:
# Monte Carlo estimate of E[g(X)] with a normal-approximation interval.
def monte_carlo_estimation(n_samples):
    """Estimate E[g(X)] for X ~ N(mu, cov) from *n_samples* random draws.

    Returns:
        ``(estimate, (lower, upper))`` where ``estimate`` is the sample mean
        of g over the draws and the interval is ``estimate -/+ 1.96`` times
        the standard error (sample standard deviation with ``ddof=1``
        divided by ``sqrt(n_samples)``).
    """
    draws = np.random.multivariate_normal(mu, cov, n_samples)
    # Apply g to every draw (one row per draw).
    values = np.apply_along_axis(g, 1, draws)
    estimate = np.mean(values)
    standard_error = np.std(values, ddof=1) / np.sqrt(n_samples)
    margin = 1.96 * standard_error
    return estimate, (estimate - margin, estimate + margin)

In [None]:
# Repeat the estimation for increasing sample sizes, keeping every result
# keyed by its sample size.
sample_sizes = [50, 100, 1000, 10000]
results = {}

for n in sample_sizes:
    results[n] = monte_carlo_estimation(n)
    mean, ci = results[n]
    print(f"Sample Size: {n}, Mean: {mean:.3f}, 95% CI: ({ci[0]:.3f}, {ci[1]:.3f})")

Sample Size: 50, Mean: 46.521, 95% CI: (41.157, 51.886)
Sample Size: 100, Mean: 44.510, 95% CI: (40.748, 48.272)
Sample Size: 1000, Mean: 43.852, 95% CI: (42.852, 44.853)
Sample Size: 10000, Mean: 44.095, 95% CI: (43.766, 44.424)


## Question 3 Answer:

In [None]:
import numpy as np

In [None]:
from scipy import stats

n0, n1 = 10000, 50

# Two independent sample sets drawn from the same N(mu, cov).
samples_g0 = np.random.multivariate_normal(mu, cov, n0)
samples_g1 = np.random.multivariate_normal(mu, cov, n1)

# Evaluate g once per sample set and reuse the values for both the variance
# and the mean.
values_g0 = np.apply_along_axis(g, 1, samples_g0)
values_g1 = np.apply_along_axis(g, 1, samples_g1)

var_g0 = np.var(values_g0, ddof=1)
var_g1 = np.var(values_g1, ddof=1)

mean_g0 = np.mean(values_g0)
mean_g1 = np.mean(values_g1)

# Standard error of the difference of the two means. Despite the variable
# name, the variances are NOT pooled: each one is divided by its own sample
# size (unequal-variance form).
pooled_std = np.sqrt(var_g0 / n0 + var_g1 / n1)
t_statistic = (mean_g0 - mean_g1) / pooled_std

# Conservative degrees of freedom for the two-sample t-test.
df = min(n0 - 1, n1 - 1)

# Two-sided critical value of Student's t at significance level 0.05 for `df`
# degrees of freedom (about 2.01 for df = 49; 1.96 is the normal limit that
# only applies for large df).
alpha = 0.05
t_critical = stats.t.ppf(1 - alpha / 2, df)

# Decision
if abs(t_statistic) > t_critical:
    decision = "Reject H0: g0 != g1"
else:
    decision = "Fail to Reject H0: g0 = g1"

print(f"T-statistic: {t_statistic:.3f}, T-critical: {t_critical:.3f}")
print(f"Decision: {decision}")


T-statistic: 0.108, T-critical: 1.960
Decision: Fail to Reject H0: g0 = g1


In [None]:
# IPython shell escape: ask nbconvert to export the notebook file to HTML.
# NOTE(review): the captured output that follows is nbconvert's usage/help
# text rather than a conversion log, which suggests the command did not run
# as intended - confirm the notebook path.
!jupyter nbconvert --to html /content/CMPE343.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr