**BLG 202E - PROJECT 2 - NAZRIN ABDINLI - 150220925**

In [2]:
#libraries
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

*Part 1 - Data Loading*

In [3]:
#loading the dataset
df = pd.read_csv("comcast_consumeraffairs_complaints.csv")

df['posted_on'] = pd.to_datetime(df['posted_on'])                   #filtering complaints from 2009 onwards
df_filter_2009 = df[df['posted_on'].dt.year >= 2009].copy()     
df_filter_2009.dropna(subset=['text'], inplace=True)                #removing rows with missing complaint details  

#text preprocessing
stop_words = set(stopwords.words('english'))
port_stem = PorterStemmer()
wn_lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())                                    #tokenization and lowercasing             
    tokens = [token for token in tokens if token.isalnum()]                 #removing punctuation
    tokens = [token for token in tokens if token not in stop_words]         #removing stopwords
    tokens = [port_stem.stem(token) for token in tokens]                    #stemming
    tokens = [wn_lemmatizer.lemmatize(token) for token in tokens]           #lemmatization
    return " ".join(tokens)

df_filter_2009['processed_text'] = df_filter_2009['text'].apply(preprocess_text)

print(df_filter_2009)
print(df_filter_2009['processed_text'])

                             author  posted_on  rating  \
0       Alantae of Chesterfeild, MI 2016-11-22       1   
1          Vera of Philadelphia, PA 2016-11-19       1   
2       Sarah of Rancho Cordova, CA 2016-11-17       1   
3          Dennis of Manchester, NH 2016-11-16       1   
4              Ryan of Bellevue, WA 2016-11-14       1   
...                             ...        ...     ...   
5196           Paul of Martinez, CA 2009-01-04       0   
5197  Adelaide of Northwildwood, NJ 2009-01-03       0   
5198       Michelle of Richmond, CA 2009-01-03       0   
5199       Jesse of Newburyport, MA 2009-01-02       0   
5200   Winston of Port St Lucie, FL 2009-01-01       0   

                                                   text  \
0     I used to love Comcast. Until all these consta...   
1     I'm so over Comcast! The worst internet provid...   
2     If I could give them a negative star or no sta...   
3     I've had the worst experiences so far since in...   
4     Ch

*Part 2 - Creating Term-by-Document Matrix*

In [4]:
#initializing TfidfVectorizer with desired parameters
vectorizer = TfidfVectorizer()

#fitting and transforming the sample data
term_doc_matrix = vectorizer.fit_transform(df_filter_2009['processed_text'])

print(term_doc_matrix)


  (0, 9452)	0.12653350694307097
  (0, 3901)	0.0588287526489019
  (0, 6434)	0.13523351221802102
  (0, 9416)	0.1921738012832514
  (0, 8287)	0.11753315739347929
  (0, 8480)	0.11827174352068655
  (0, 8546)	0.17824454642837595
  (0, 6852)	0.1301991481420399
  (0, 241)	0.12739419833561247
  (0, 909)	0.11471980074942563
  (0, 5526)	0.32831412777811037
  (0, 7866)	0.2370881406641892
  (0, 9364)	0.1611811768578798
  (0, 3181)	0.13406483051292015
  (0, 6423)	0.13898244699133835
  (0, 2676)	0.1504094341214357
  (0, 9394)	0.07764296612606345
  (0, 3362)	0.08400328259544916
  (0, 1927)	0.11609937348033708
  (0, 2563)	0.07278968743523499
  (0, 7867)	0.3312338469851849
  (0, 5846)	0.15123694403354335
  (0, 5227)	0.14712531742347082
  (0, 2416)	0.4978095237967047
  (0, 1776)	0.07306376743997113
  :	:
  (5057, 1091)	0.10973287265876155
  (5057, 5223)	0.11335891272829174
  (5057, 976)	0.09707878127914688
  (5057, 6736)	0.07792895889266109
  (5057, 7479)	0.09172560005246519
  (5057, 5315)	0.1866114288271

In [10]:
#getting sample data from original data for checking the code
sample_data = df_filter_2009["text"][:200]

vectorizer = TfidfVectorizer(max_features=300)

term_doc_matrix = vectorizer.fit_transform(sample_data)
print(term_doc_matrix)

  (0, 295)	0.12488781127211442
  (0, 102)	0.09345157065210982
  (0, 166)	0.07917429316932827
  (0, 176)	0.10760880020258352
  (0, 170)	0.07192251769831076
  (0, 198)	0.18425206890063509
  (0, 165)	0.09288969051104663
  (0, 2)	0.1874710652268045
  (0, 100)	0.12523935004952594
  (0, 112)	0.0704371851014994
  (0, 13)	0.16594201227300545
  (0, 95)	0.1682018076886719
  (0, 134)	0.11143038991206212
  (0, 36)	0.11728956032803683
  (0, 133)	0.13727421676613638
  (0, 8)	0.34409682091262905
  (0, 225)	0.374942130453609
  (0, 79)	0.2123079839948774
  (0, 286)	0.09756058409266898
  (0, 247)	0.06587819288445902
  (0, 173)	0.07230043515021628
  (0, 290)	0.13622799325800872
  (0, 89)	0.12488781127211442
  (0, 81)	0.2560860543936221
  (0, 57)	0.21802127755245213
  :	:
  (199, 298)	0.14280619547976742
  (199, 26)	0.07970491216418446
  (199, 46)	0.06976033618557186
  (199, 130)	0.10878690069127558
  (199, 293)	0.0802175947214536
  (199, 125)	0.17026979085552002
  (199, 193)	0.12879950319126698
  (199, 1

*Part 3 - Computing SVD and Implementation*

In [6]:
#custom SVD function
def svd(A):
    Ui = A.dot(A.transpose())
    Vi = A.transpose().dot(A)
    _,eig_values_U, U = custom_eig(Ui)
    U = U.real
    eig_values_U.real
    _,eig_values_V, V = custom_eig(Vi)
    V = V.real
    eig_values_V.real
    # Sorting eigenvalues and corresponding eigenvectors in descending order
    idx_U = np.argsort(eig_values_U)[::-1]
    idx_V = np.argsort(eig_values_V)[::-1]
    U = U[:, idx_U]
    V = V[:, idx_V]
    # Taking square root of positive eigenvalues
    Si = np.sqrt(np.maximum(eig_values_U, 0))
    S = np.diag(Si)
    return U, S, V.transpose()

#custom eigenvalue decomposition function
def custom_eig(matrix, epsilon=1e-10, max_iterations=1000):
    matrix_dense = matrix.toarray()  # Convert to dense array
    m, n = matrix_dense.shape
    eigenvalues = np.zeros(n)
    eigenvectors = np.eye(n)
    U = np.eye(m)  # Initialize U as a 2D identity matrix
    for i in range(n):
        v = np.random.rand(n)
        for _ in range(max_iterations):
            v_next = np.dot(matrix_dense, v)
            v_next_norm = np.linalg.norm(v_next)
            v_next /= v_next_norm
            eigenvalue = np.dot(v_next, np.dot(matrix_dense, v_next))
            if np.abs(eigenvalue - eigenvalues[i]) < epsilon:
                break
            v = v_next
            eigenvalues[i] = eigenvalue
        eigenvectors[:, i] = v_next
        U[:, i] = v  # Update U with the computed eigenvector
        matrix_dense -= eigenvalues[i] * np.outer(v_next, v_next)
    return eigenvectors, eigenvalues, U

#computing SVD 
U, Sigma, V_T = svd(term_doc_matrix)

#printing shapes of resulting matrices
print("Shape of U:", U.shape)
print("Shape of Sigma:", Sigma.shape)
print("Shape of V_T:", V_T.shape)

#evaluating SVD Approximation
def calculate_mse(original_matrix, reconstructed_matrix):
    #MSE
    mse = np.mean(np.square(original_matrix - reconstructed_matrix))
    return mse

def calculate_frobenius_norm(original_matrix, reconstructed_matrix):
    #frobenius norm
    fn = np.linalg.norm(original_matrix - reconstructed_matrix, ord='fro')
    return fn

Shape of U: (200, 200)
Shape of Sigma: (200, 200)
Shape of V_T: (300, 300)


In [8]:
#printing dimensions of term_doc_matrix
t, d = term_doc_matrix.shape
print("t:", t)
print("d:", d)

min_k = max(10, min(t, d) // 10 + 1)        #determining minimum value of k (number of singular values to keep)
k_values = []                               #generating a list of k values to iterate over
for i in range(min_k, min(t, d) + 1, 20):
    print(i)
    k_values.append(i)

print("k_values:", list(k_values))

mse_values = []
fn_values = []

for k in k_values:
    U_k, Sigma_k, V_T_k = svd(term_doc_matrix)                  #performing SVD on term_doc_matrix
    U_k = U_k[:, :k]
    Sigma_k = Sigma_k[:k, :k]                                   #keeping only first k singular values
    V_T_k = V_T_k[:, :k]
    reconstructed_matrix = np.dot(np.dot(U_k, Sigma_k), V_T_k.transpose())      #reconstructing the matrix using the truncated SVD

    #calculating Mean Squared Error (MSE) and Frobenius Norm (FN)
    mse = calculate_mse(term_doc_matrix, reconstructed_matrix)  
    fn = calculate_frobenius_norm(term_doc_matrix, reconstructed_matrix)  
    mse_values.append(mse)
    fn_values.append(fn)
    
    print("k:", k)
    print("MSE:", mse)
    print("Frobenius Norm:", fn)
    print("="*50)

#checking if MSE and FN lists are empty   
if not mse_values:
    print("Error: mse_values is empty.")
if not fn_values:
    print("Error: fn_values is empty.")

print("Length of mse_values:", len(mse_values))
print("Length of fn_values:", len(fn_values))

#finding optimal k values for MSE and FN
optimal_k_mse = k_values[np.argmin(mse_values)]
optimal_k_fn = k_values[np.argmin(fn_values)]

print("Optimal k for MSE:", optimal_k_mse)
print("Optimal k for Frobenius Norm:", optimal_k_fn)


t: 200
d: 300
21
41
61
81
101
121
141
161
181
k_values: [21, 41, 61, 81, 101, 121, 141, 161, 181]
k: 21
MSE: 0.04907505524265239
Frobenius Norm: 54.26327777198078
k: 41
MSE: 0.06301750588247766
Frobenius Norm: 61.49024599843995
k: 61
MSE: 0.07275902377833385
Frobenius Norm: 66.07224399625026
k: 81
MSE: 0.08850652790041792
Frobenius Norm: 72.87243425346155
k: 101
MSE: 0.09063798802609442
Frobenius Norm: 73.74468985334242
k: 121
MSE: 0.09282712063094313
Frobenius Norm: 74.62993526632987
k: 141
MSE: 0.08987034846763518
Frobenius Norm: 73.43174319092603
k: 161
MSE: 0.09955969031462826
Frobenius Norm: 77.28894758552283
k: 181
MSE: 0.09193111256014257
Frobenius Norm: 74.26888146194578
Length of mse_values: 9
Length of fn_values: 9
Optimal k for MSE: 21
Optimal k for Frobenius Norm: 21


*Part 4 - Query-Document Cosine Similarity*

In [9]:
#query-document cosine similarity
queries = [
    ['ignorant', 'overwhelming'],
    ['xfinity', 'frustrate', 'adapter', 'verizon', 'router'],
    ['terminate', 'rent', 'promotion', 'joke', 'liar', 'internet', 'horrible'],
    ['kindergarten', 'ridiculous', 'internet', 'clerk', 'terrible']
]

#calculating TF-IDF matrix for queries
query_matrix = vectorizer.transform([' '.join(query) for query in queries]).toarray()

#calculating cosine similarity 
cosine_similarities = []
for query in query_matrix:
    sim = []
    for doc in term_doc_matrix.toarray():
        sim.append(np.dot(query, doc) / (np.linalg.norm(query) * np.linalg.norm(doc)))
    cosine_similarities.append(sim)

cosine_similarities = np.array(cosine_similarities)

#finding the most relevant document for each query
for i, query in enumerate(queries):
    most_similar_doc_index = np.argmax(cosine_similarities[i])
    most_similar_doc_text = df_filter_2009.iloc[most_similar_doc_index]['text']
    print(f"Most relevant document for query {i+1}:")
    print("Text:", most_similar_doc_text)
    print("Cosine Similarity:", cosine_similarities[i, most_similar_doc_index])
    print("="*50)

Most relevant document for query 1:
Text: I used to love Comcast. Until all these constant updates. My internet and cable crash a lot at night, and sometimes during the day, some channels don't even work and on demand sometimes don't play either. I wish they will do something about it. Because just a few mins ago, the internet have crashed for about 20 mins for no reason. I'm tired of it and thinking about switching to Wow or something. Please do not get Xfinity.
Cosine Similarity: nan
Most relevant document for query 2:
Text: I called Xfinity to troubleshoot my internet at 4 am 2/17/2016, got connected to someone who I swear came to this country on a floating door as I had to tell him twice I was outside and not near the router, had to tell him three times I was NOT going to set an appointment for a service technician as I wasn't going to pay $25 for that.He tried resetting my router which did not work, and I eventually hung up on him, went on my phone to xfinity chat and was told aga

  sim.append(np.dot(query, doc) / (np.linalg.norm(query) * np.linalg.norm(doc)))
