# Machine Learning Classification
- Perform Cosine similarity on each input data point
- Compare to sample profiles to classify into various classes
- Can expand to more than 1

In [12]:
# Generate example data in pd.DataFrame() format
import pandas as pd
df = pd.DataFrame(
    [
        {"name": "Zain", "link": "zain.com", "headline": "AI student at UCL", "location": "London"},
        {"name": "Tim", "link": "tim.com", "headline": "CEO of AMSTRAD", "location": "London"},
        {"name": "Jude Samuel", "link": "ingloriousgrapplers.com", "headline": "Head Coach at Inglorious Grapplers | Matchmaker at Bellator", "location": "London"}
    ]
)

In [2]:
# Use Sentence Transformers for Feature Extraction: 
from sentence_transformers import SentenceTransformer

# Load a model specifically trained for sentence embeddings
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import numpy as np

true: str = """
Name: Julian Cross
Location: London, UK
Headline: London-based C-Level Executive | Driving Operational Excellence & Strategic Growth | Helping Businesses scale with purpose
"""
false: str = """
Name: Chloe Wilson
Location: California, USA
Headline: Gender Studies student at Santa Monica College | Focussed on social justice advocacy and community organising | Pet lover
"""
comp: np.ndarray = np.array([model.encode(true), model.encode(false)])
print(comp.shape)
print(tmp.shape)

dot = np.dot(tmp, comp.T)
abs = np.dot(np.linalg.norm(tmp, axis=1, keepdims=True), np.linalg.norm(comp, axis=1, keepdims=True).T)
sim = dot / abs
exp = np.exp(sim)
sum = np.sum(exp, axis=1, keepdims=True)
prob = exp / sum
print(prob)

(2, 768)
(3, 768)
[[0.506789   0.49321103]
 [0.55170304 0.4482969 ]
 [0.5217679  0.4782321 ]]


In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
# Load a model specifically trained for sentence embeddings

class Model:
    """
    Require a pandas series of texts and outputs the 
    """
    def __init__(self):
        self.model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
        self.samples = None
    
    def run(self, text: pd.Series) -> pd.Series:
        """Run start to finish from a series of texts to a series of true or false"""
        embeddings = self._genEmbeddings(text)
        self._getProf()
        similarity = self._calcSim(embeddings)
        classes = self._classify(similarity)
        return self._construct(classes)


    def _genEmbeddings(self, text: pd.Series) -> np.ndarray:
        # Refactor the dataframe into an array of strings then feed self.model.encode(array_of_strings)
        return self.model.encode(text)

    def _getProf(self) -> np.ndarray:
        # Provie 2 example profiles, generate embeddings and store in array
        # Used to dot product and find cosine similarity with other data points
        true: str = """
Name: Julian Cross
Location: London, UK
Headline: London-based C-Level Executive | Driving Operational Excellence & Strategic Growth | Helping Businesses scale with purpose
"""
        false: str = """
Name: Chloe Wilson
Location: California, USA
Headline: Gender Studies student at Santa Monica College | Focussed on social justice advocacy and community organising | Pet lover
"""
        comp: np.ndarray = np.array([self.model.encode(true), self.model.encode(false)])
        self.samples = comp
        return comp

    def _calcSim(self, embeddings: np.ndarray) -> np.ndarray:
        # Apply dot product to cosine similarity then softmax with 2 other pre-calculated embeddings

        dot = np.dot(embeddings, self.samples.T)
        abs = np.dot(np.linalg.norm(embeddings, axis=1, keepdims=True), np.linalg.norm(self.samples, axis=1, keepdims=True).T)
        sim = dot / abs
        exp = np.exp(sim)
        sum = np.sum(exp, axis=1, keepdims=True)
        prob = exp / sum
        return prob
    
    def _classify(self, arr: np.ndarray):
        # arr should be have 2 columns. Convert into true or false
        return arr[:, 0] >= arr[:, 1]

    def _construct(self, arr: np.ndarray):
        # Create Pandas series of True or False
        return pd.Series(arr)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
df = pd.Series(
    [
        """
        name: Zain, 
        location: London, UK, 
        headline: AI student at UCL, location: London
        """,
        """
        name: Tim, 
        location: London, UK, 
        headline: CEO of AMSTRAD
        """,
        """
        name: Jude Samuel, 
        location: London, UK, 
        headline: Head Coach at Inglorious Grapplers | Matchmaker at Bellator
        """,
    ]
)

In [3]:
client = Model()
out = client.run(df)
print(out)

0    True
1    True
2    True
dtype: bool


In [4]:
print(type(out))

<class 'pandas.core.series.Series'>
