In [5]:
import os
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [7]:
from tensorflow import keras

from keras.models import Model, load_model
from keras.layers import GlobalAveragePooling2D, Dense, Dropout

In [23]:
def unique_names(names1, names2):
    """Return the distinct names found across both lists.

    The two lists are concatenated and de-duplicated through a set, so the
    order of the returned list is arbitrary.
    """
    combined = names1 + names2
    distinct = set(combined)
    return list(distinct)

unique_names(['Ava', 'Emma', 'Olivia'], ['Olivia', 'Sophia', 'Emma'])

['Sophia', 'Ava', 'Olivia', 'Emma']

---

## Solution to the Sentiment Analysis Classifier Problem

Description:

You are given a dataset of movie reviews and their corresponding sentiment labels (positive or negative). Your task is to implement a simple sentiment analysis classifier using Python. You should use a basic machine learning model (like Logistic Regression) and preprocess the text data using Natural Language Processing (NLP) techniques.

In [24]:
# Training corpus: five short movie reviews.
reviews = [
    "I loved this movie, it was fantastic!",
    "Absolutely terrible movie, would not recommend.",
    "The plot was decent, but the acting was poor.",
    "An outstanding experience, truly a masterpiece!",
    "This was the worst film I have ever seen."
]

# One sentiment label per review, in the same order as `reviews`
# (presumably 1 = positive, 0 = negative — inferred from the review texts).
labels = [1, 0, 0, 1, 0]

# Reviews kept out of training; the fitted classifier is scored on these.
test_reviews = [
    "I enjoyed watching this movie.",
    "The movie was a waste of time."
]

# Expected Output: Accuracy score on the test dataset (e.g., 0.5)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Create a TfidfVectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Split the labeled reviews into a training part and a validation part.
# With five reviews and test_size=0.2 the validation part holds a single review.
X_train, X_val, y_train, y_val = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Learn the vocabulary from the training reviews only and build their TF-IDF matrix
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Project the validation review onto the vocabulary learned above
X_tfidf_val = tfidf_vectorizer.transform(X_val)

print(X_tfidf_train.shape, X_tfidf_val.shape)

# Project the test reviews onto the same vocabulary
X_tfidf_test = tfidf_vectorizer.transform(test_reviews)
print(X_tfidf_test.shape)

# Print the learned vocabulary and its size
print(tfidf_vectorizer.get_feature_names_out(), len(tfidf_vectorizer.get_feature_names_out()))

(4, 22) (1, 22)
(2, 22)
['acting' 'an' 'but' 'decent' 'ever' 'experience' 'fantastic' 'film'
 'have' 'it' 'loved' 'masterpiece' 'movie' 'outstanding' 'plot' 'poor'
 'seen' 'the' 'this' 'truly' 'was' 'worst'] 22


In [45]:
# Fit a logistic-regression classifier on the TF-IDF features of the training reviews.

from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model with scikit-learn's default settings
lr = LogisticRegression()

# Fit the model
lr.fit(X_tfidf_train, y_train) 

# Predict the held-out validation review and print it next to its true label
val = lr.predict(X_tfidf_val)
print("valuation:", val, "y_val: " , y_val)

# Predict the two test reviews
preds = lr.predict(X_tfidf_test)

# Element-wise check against [1, 0], the labels expected for `test_reviews`
# (presumably positive, then negative — they are hard-coded here, not stored in a variable).
print(preds == [1, 0])

# Accuracy = fraction of test predictions that match the expected labels
accuracy = (preds == [1, 0]).mean()
print("accuracy: ",accuracy)

valuation: [1] y_val:  [0]
[ True  True]
accuracy:  1.0


## Problem: Text Classification with Naive Bayes

Description:

You are given a dataset of SMS messages labeled as either "spam" or "ham" (non-spam). Your task is to implement a text classification system using Python to identify whether a given SMS message is spam or not. You will use the Naive Bayes classifier, which is commonly used for text classification tasks.

In [1]:
# Training corpus: five SMS messages.
messages = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.",
    "Nah I don't think he goes to usf, he lives around here though.",
    "FreeMsg Hey there darling it's been 3 weeks now and no word back!",
    "Even my brother is not like to speak with me. They treat me like aids patent.",
    "WINNER!! As a valued network customer you have been selected to receive a £900 prize reward!"
]

# One label per message, in the same order as `messages`
# (presumably 1 = spam, 0 = ham — inferred from the message texts).
labels = [1, 0, 1, 0, 1]

# Messages kept out of training; the fitted classifiers are scored on these.
test_messages = [
    "Congratulations, you have won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "I'll text you when I'm done. See you later."
]

# Expected Output: Accuracy score on the test dataset (e.g., 0.5)


In [61]:
# Create a fresh TfidfVectorizer for the SMS data
# (the class is imported in the sentiment-analysis cell above).
tfidf_vectorizer = TfidfVectorizer()

# No train/validation split in this section: the split used for the reviews
# is left disabled and the vectorizer is fitted on all of `messages`.
# X_train, X_val, y_train, y_val = train_test_split(messages, labels, test_size=0.2, random_state=42)

# Learn the vocabulary from every message and build the TF-IDF matrix
X_tfidf_train = tfidf_vectorizer.fit_transform(messages)

# (validation transform disabled together with the split)
# X_tfidf_val = tfidf_vectorizer.transform(X_val)

print(X_tfidf_train.shape)

# Project the test messages onto the vocabulary learned above
X_tfidf_test = tfidf_vectorizer.transform(test_messages)
print(X_tfidf_test.shape)

# Print the learned vocabulary and its size
print(tfidf_vectorizer.get_feature_names_out(), len(tfidf_vectorizer.get_feature_names_out()))

(5, 61)
(2, 61)
['2005' '21st' '900' 'aids' 'and' 'around' 'as' 'back' 'been' 'brother'
 'comp' 'cup' 'customer' 'darling' 'don' 'entry' 'even' 'fa' 'final'
 'free' 'freemsg' 'goes' 'have' 'he' 'here' 'hey' 'in' 'is' 'it' 'like'
 'lives' 'may' 'me' 'my' 'nah' 'network' 'no' 'not' 'now' 'patent' 'prize'
 'receive' 'reward' 'selected' 'speak' 'there' 'they' 'think' 'though'
 'tkts' 'to' 'treat' 'usf' 'valued' 'weeks' 'win' 'winner' 'with' 'wkly'
 'word' 'you'] 61


In [62]:
# Baseline for the SMS task: logistic regression on the TF-IDF features.
# (LogisticRegression is imported in the sentiment-analysis cell above; the
# Naive Bayes model is fitted further down in the CountVectorizer section.)

lr = LogisticRegression()

# Fit the model on all five labeled messages
lr.fit(X_tfidf_train, labels)

# Predict the two test messages

preds = lr.predict(X_tfidf_test)

# Show the predictions next to an element-wise check against [1, 0],
# the hard-coded labels expected for `test_messages`.
print(preds, ";", preds == [1, 0])

# Accuracy = fraction of test predictions that match the expected labels
accuracy = (preds == [1, 0]).mean()
print("accuracy: ",accuracy)

[1 1] ; [ True False]
accuracy:  0.5


### Another approach: CountVectorizer with Naive Bayes

In [22]:
# `string` is only imported in a later cell, so import it here as well;
# otherwise this cell raises NameError when the notebook is run top to bottom.
import string

# Show the characters treated as punctuation, then the raw first message.
print(string.punctuation)
messages[0]

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.'

In [21]:
# Delete every punctuation character from the first message in one pass.
messages[0].translate(str.maketrans('', '', string.punctuation))

'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005'

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import string

def clean_text(text):
    """
    Normalize a message before vectorization.

    Converts text to lowercase and removes every character listed in
    ``string.punctuation`` (ASCII punctuation). Digits, whitespace, and
    non-ASCII symbols such as '£' are left untouched.

    :param text: (string) The raw message.
    :returns: (string) The lowercased message without ASCII punctuation.
    """
    # A translation table with only a "delete" set strips all punctuation in one pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

# Clean the training messages
cleaned_messages = [clean_text(msg) for msg in messages]
print(cleaned_messages)

['free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005', 'nah i dont think he goes to usf he lives around here though', 'freemsg hey there darling its been 3 weeks now and no word back', 'even my brother is not like to speak with me they treat me like aids patent', 'winner as a valued network customer you have been selected to receive a £900 prize reward']


In [3]:
# Instantiate a CountVectorizer with default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned training messages and count term occurrences

X_count_train = count_vectorizer.fit_transform(cleaned_messages)

# Clean the test messages the same way, then project them onto the learned vocabulary
cleaned_test_messages = [clean_text(msg) for msg in test_messages]
X_count_test = count_vectorizer.transform(cleaned_test_messages)

# Matrix shapes, the vocabulary with its size, and the dense training count matrix
print(X_count_train.shape, X_count_test.shape)
print(count_vectorizer.get_feature_names_out(), len(count_vectorizer.get_feature_names_out()))
print(X_count_train.toarray())

(5, 61) (2, 61)
['2005' '21st' '900' 'aids' 'and' 'around' 'as' 'back' 'been' 'brother'
 'comp' 'cup' 'customer' 'darling' 'dont' 'entry' 'even' 'fa' 'final'
 'free' 'freemsg' 'goes' 'have' 'he' 'here' 'hey' 'in' 'is' 'its' 'like'
 'lives' 'may' 'me' 'my' 'nah' 'network' 'no' 'not' 'now' 'patent' 'prize'
 'receive' 'reward' 'selected' 'speak' 'there' 'they' 'think' 'though'
 'tkts' 'to' 'treat' 'usf' 'valued' 'weeks' 'win' 'winner' 'with' 'wkly'
 'word' 'you'] 61
[[1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 2 1 0 0 0 0 0 1 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
  1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 2 1 0 0
  0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 0 0 1 0 1 0 0 

In [4]:
# Create a MultinomialNB model with default settings
nb = MultinomialNB()

# Fit the model on the term counts of all five labeled messages
nb.fit(X_count_train, labels)

# Predict the two test messages
preds = nb.predict(X_count_test)

# Show the predictions next to an element-wise check against [1, 0],
# the hard-coded labels expected for `test_messages`.
print(preds, ";", preds == [1, 0])

# Accuracy = fraction of test predictions that match the expected labels
accuracy = (preds == [1, 0]).mean()
print("accuracy: ",accuracy)

[1 1] ; [ True False]
accuracy:  0.5


---

### Named Entity Recognition (NER) with Conditional Random Fields (CRF)

Description:

You are given a dataset consisting of sentences, where each word in a sentence is labeled with its corresponding named entity. Each word carries one of four labels: the three entity types PERSON, ORG (organization), and LOC (location), or O (other, meaning no named entity). Your task is to implement a Named Entity Recognition (NER) system using Python that labels the entities in sentences using Conditional Random Fields (CRF).

In [11]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
from sklearn_crfsuite.metrics import flat_classification_report
import nltk

In [6]:
# Training sentences: each one is a list of (token, label) pairs.
sentences = [
    [("John", "PERSON"), ("works", "O"), ("at", "O"), ("Acme", "ORG"), (".", "O")],
    [("Paris", "LOC"), ("is", "O"), ("beautiful", "O"), ("in", "O"), ("spring", "O")],
    [("Mary", "PERSON"), ("lives", "O"), ("in", "O"), ("New", "LOC"), ("York", "LOC")],
]

# Sentences used to evaluate the trained tagger, in the same (token, label) format.
# NOTE(review): "Alice" is labeled "O" below although it looks like a person's name —
# confirm against the problem statement whether it should be "PERSON".
test_sentences = [
    [("Alice", "O"), ("is", "O"), ("from", "O"), ("London", "LOC")],
    [("Google", "ORG"), ("was", "O"), ("founded", "O"), ("by", "O"), ("Larry", "PERSON"), ("and", "O"), ("Sergey", "PERSON")],
]

# Expected Output: Precision, recall, and F1-score (e.g., {"precision": 0.85, "recall": 0.80, "f1": 0.82})


In [12]:
def word2features(sent, i):
    """
    Build the feature dict for the token at position ``i`` of ``sent``.

    :param sent: (list) The sentence as (token, label) pairs; only the token
        (element 0 of each pair) is read.
    :param i: (int) Index of the token to describe.
    :returns: (dict) Features of the token itself, plus features of its left
        and right neighbours or the 'BOS' / 'EOS' flag at a sentence edge.
    """
    word = sent[i][0]
    features = {
        'word': word,
        'lowercase': word.lower(),
        # word[:1] instead of word[0]: an empty token gives False rather than IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_digit': word.isdigit(),
        'prefix-1': word[:1],
        'suffix-1': word[-1:],
        'prefix-2': word[:2],
        'suffix-2': word[-2:]
    }

    if i > 0:
        features.update(_neighbor_features('-1', sent[i-1][0]))
    else:
        features['BOS'] = True  # Beginning of a sentence

    if i < len(sent)-1:
        features.update(_neighbor_features('+1', sent[i+1][0]))
    else:
        features['EOS'] = True  # End of a sentence

    return features


def _neighbor_features(offset, neighbor):
    """Describe an adjacent token; ``offset`` ('-1' or '+1') prefixes every key."""
    return {
        f'{offset}:word': neighbor,
        # Slicing keeps an empty neighbouring token from raising IndexError.
        f'{offset}:is_capitalized': neighbor[:1].isupper(),
        f'{offset}:lowercase': neighbor.lower()
    }

def sentence2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sentence2labels(sent):
    """Return the label of every (token, label) pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

def sentence2tokens(sent):
    """Return the token of every (token, label) pair in ``sent``, in order."""
    words = []
    for word, _tag in sent:
        words.append(word)
    return words


In [13]:
# Turn every sentence into its per-token feature dicts (X) and its label sequence (y).
X_train = list(map(sentence2features, sentences))
y_train = list(map(sentence2labels, sentences))

X_test = list(map(sentence2features, test_sentences))
y_test = list(map(sentence2labels, test_sentences))


In [14]:
X_train 

[[{'word': 'John',
   'lowercase': 'john',
   'is_capitalized': True,
   'is_digit': False,
   'prefix-1': 'J',
   'suffix-1': 'n',
   'prefix-2': 'Jo',
   'suffix-2': 'hn',
   'BOS': True,
   '+1:word': 'works',
   '+1:is_capitalized': False,
   '+1:lowercase': 'works'},
  {'word': 'works',
   'lowercase': 'works',
   'is_capitalized': False,
   'is_digit': False,
   'prefix-1': 'w',
   'suffix-1': 's',
   'prefix-2': 'wo',
   'suffix-2': 'ks',
   '-1:word': 'John',
   '-1:is_capitalized': True,
   '-1:lowercase': 'john',
   '+1:word': 'at',
   '+1:is_capitalized': False,
   '+1:lowercase': 'at'},
  {'word': 'at',
   'lowercase': 'at',
   'is_capitalized': False,
   'is_digit': False,
   'prefix-1': 'a',
   'suffix-1': 't',
   'prefix-2': 'at',
   'suffix-2': 'at',
   '-1:word': 'works',
   '-1:is_capitalized': False,
   '-1:lowercase': 'works',
   '+1:word': 'Acme',
   '+1:is_capitalized': True,
   '+1:lowercase': 'acme'},
  {'word': 'Acme',
   'lowercase': 'acme',
   'is_capitalized

In [17]:
y_train

[['PERSON', 'O', 'O', 'ORG', 'O'],
 ['LOC', 'O', 'O', 'O', 'O'],
 ['PERSON', 'O', 'O', 'LOC', 'LOC']]

In [15]:
# Configure the CRF tagger.
# c1 / c2 are regularization coefficients (L1 / L2 per the sklearn-crfsuite
# documentation — confirm against the installed version).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)

# Train on the feature dicts and label sequences of the training sentences
crf.fit(X_train, y_train)


In [16]:
# Tag the test sentences with the fitted CRF
y_pred = crf.predict(X_test)

# Per-label precision / recall / F1 over all tokens.
# zero_division=0 is forwarded to scikit-learn's classification_report: a label the
# model never predicts is reported as 0.0 without an undefined-metric warning.
print(flat_classification_report(
    y_test, y_pred, labels=['PERSON', 'ORG', 'LOC', 'O'], digits=3, zero_division=0
))

              precision    recall  f1-score   support

      PERSON      0.000     0.000     0.000         2
         ORG      0.000     0.000     0.000         1
         LOC      0.333     1.000     0.500         1
           O      0.857     0.857     0.857         7

    accuracy                          0.636        11
   macro avg      0.298     0.464     0.339        11
weighted avg      0.576     0.636     0.591        11



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st

def distribution_fitting(distribution_code, investments):
    """
    Fit the requested distribution to the investments and score the fit with the AIC.

    :param distribution_code (string): 'norm', 'cauchy', or 'expon'.
    :param investments: (list) The size of each investment received.
    :returns: (float) Akaike information criterion of the distribution for the dataset.
    :raises ValueError: if distribution_code is not one of the three supported codes.
    """
    data = np.array(investments)

    # Supported codes paired with the scipy.stats distribution they select.
    supported = (('norm', st.norm), ('cauchy', st.cauchy), ('expon', st.expon))
    dist = next((candidate for code, candidate in supported if distribution_code == code), None)
    if dist is None:
        raise ValueError("Invalid distribution code. Use 'norm', 'cauchy', or 'expon'.")

    # Fit the distribution, then sum the log-density of every observation under the fit.
    params = dist.fit(data)
    log_likelihood = np.sum(dist.logpdf(data, *params))

    # Each supported distribution is counted as having two parameters (location and scale).
    k = 2

    # AIC = 2k - 2 ln(L)
    return 2 * k - 2 * log_likelihood

# Test the function
investments = [
    11624, 9388, 9471, 8927,
    10865, 7698, 11744, 9238,
    10319, 9750, 11462, 7939
]
print(distribution_fitting('norm', investments))  # Expected output ~210.24074

210.24073823727426
