In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.patches import Patch
import numpy as np
%matplotlib inline

from category_encoders import *
from IPython.display import display
import scipy.stats as sp
import datetime as dt
import re
#import categorical_embedder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
le = LabelEncoder()
sns.set(rc={'figure.figsize':(14, 8)})

In [None]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on a machine
# where this exact file exists; consider a path relative to a configurable data directory.
datainput = pd.read_csv(r"C:\Users\Ighdaro Emwinghare\Downloads\filtered_req_calls.csv")

In [None]:
datainput.T

In [None]:
# Since we are interested in analyzing different product_ids, drop the rows whose
# product_id is null.
# .copy() makes `data` an independent frame, so the column assignments in the
# following cells neither raise SettingWithCopyWarning nor depend on `datainput`.
data = datainput.dropna(axis=0, subset=['product_id']).copy()

In [None]:
data.isnull().sum()

In [None]:
print(dict(data['product_id'].value_counts()))

In [None]:
# Derive the month period from the filtered frame itself (not from `datainput`),
# so only the retained rows are parsed and no index alignment is relied upon.
data['monthyear'] = pd.to_datetime(data['date']).dt.to_period('M')

In [None]:
# Replace missing values with an explicit placeholder, one placeholder per column.
fill_values = {
    'content_category': 'Unknown',
    'touchpoint_channel_clm': 'Unknown',
    'content_message_local': 'NoMessage',
    'content_message_global': 'NoMessage',
}
for column, placeholder in fill_values.items():
    data[column] = data[column].fillna(placeholder)

In [None]:
dict(data['content_message_local'].value_counts())

In [None]:
dict(data['content_message_global'].value_counts())

In [None]:
# Build the 'tactic' column by concatenating content_message_local, the literal
# separator ' SGM ' and content_message_global.
data['tactic'] = data['content_message_local'] + ' SGM ' + data['content_message_global']

In [None]:
data['tactic'].nunique()

In [None]:
data['tactic'].value_counts()

In [None]:
data.isnull().sum()/len(data)

#99% values in tactic, content_category, content_message_local and content_message_global are missing.

In [None]:
data.columns

In [None]:
"""Helper Functions"""


def get_cat_feats(data=None):
    '''
    List the names of the object-dtype (categorical) columns of a data set.
    Parameters:
    -----------
        data: DataFrame
            Data set whose columns are inspected.
    Returns:
    -------
        List
            Names of the columns selected by ``select_dtypes(include=['object'])``,
            in their original column order.
    Raises:
    -------
        ValueError
            If data is None.
    Most of the other helpers call this function to obtain the categorical variables.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    object_frame = data.select_dtypes(include=['object'])
    return [column for column in object_frame.columns]
def get_num_feats(data=None):
    '''
    List the names of the numerical columns of a data set.
    Parameters:
    -----------
        data: DataFrame
            Data set whose columns are inspected.
    Returns:
    -------
        List:
            Names of the columns whose dtype is one of int16, int32, int64,
            float16, float32 or float64, in their original column order.
    Raises:
    -------
        ValueError
            If data is None.
    Other helpers call this function to obtain the numerical variables.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_frame = data.select_dtypes(include=numeric_dtypes)
    return [column for column in numeric_frame.columns]
def get_unique_counts(data=None):
    '''
    Count the distinct values of every categorical feature in a data set.
    Parameters
    -----------
        data: DataFrame
            Data set whose categorical (object) columns are counted.
    Returns
    -------
        pandas Styler
            A styled two-column table ('Feature', 'Unique Count') with an inline
            bar drawn on the 'Unique Count' column. Missing values count as a
            distinct value because ``unique()`` is used.
    Raises
    ------
        ValueError
            If data is None.
    It is used by the describe function to show the count of unique values per column.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    cat_cols = get_cat_feats(data)
    rows = [(col, len(data[col].unique())) for col in cat_cols]

    counts = pd.DataFrame(rows, columns=['Feature', 'Unique Count'])
    return counts.style.bar(subset=['Unique Count'], align='mid')
def display_missing(data=None, plot=False):
    '''
    Summarise the missing values of a data set as a pandas dataframe.
    Parameters
    ----------
        data: DataFrame
            Data set to inspect.
        plot: bool, Default False
            When True, additionally draws the missing values as a heatmap
            (via plot_missing) before returning.

    Returns
    -------
        DataFrame
            One row per column of `data` with the columns 'features',
            'missing_counts' and 'missing_percent' (rounded to one decimal).
    Raises
    ------
        ValueError
            If data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    summary = data.isna().sum().reset_index()
    summary.columns = ['features', 'missing_counts']
    summary['missing_percent'] = round((summary['missing_counts'] / data.shape[0]) * 100, 1)

    # The table is returned in both cases; the heatmap is only an optional extra.
    if plot:
        plot_missing(data)
    return summary
    
def cat_summarizer(data, x=None, y=None, hue=None, palette='Set1', verbose=True):
    '''
    Helper function that gives a quick summary of a given column of categorical data
    Parameters:
    ---------------------------
        data: pandas dataframe
        x: str.
            Column plotted on the horizontal axis; the counts go on the vertical axis.
        y: str.
            Column plotted on the vertical axis; the counts go on the horizontal axis.
            Only used for the summary when `x` is None.
        hue: str.
            Optional column to split the counts by (usually the target variable).
        palette: array, list.
            Colour palette of the plot.
        verbose: bool, Default True
            Also print the value counts of the summarised column.
    Returns:
    ----------------------
        None. Prints describe(), the mode and (if verbose) the value counts of the
        column, then shows a seaborn count plot.
    Raises:
    ----------------------
        ValueError
            If neither `x` nor `y` is given.

        it is used in the describe function
    '''
    # Fail early with a clear message instead of the opaque KeyError from data[None].
    if x is None and y is None:
        raise ValueError("cat_summarizer: one of 'x' or 'y' must name a column of data")

    column_interested = y if x is None else x
    series = data[column_interested]
    print(series.describe())
    print('mode: ', series.mode())
    if verbose:
        print('='*80)
        print(series.value_counts())

    sns.countplot(x=x, y=y, hue=hue, data=data, palette=palette)
    plt.show()
    
def _space():
    '''it is used in  most functions to add space. this makes result more presentation'''
    print('\n')
def _match_date(data):
    '''
        Return a list of columns that matches the DateTime expression
    '''
    mask = data.sample(20).astype(str).apply(lambda x : x.str.match(r'(\d{2,4}-\d{2}-\d{2,4})+').all())
    return set(data.loc[:, mask].columns)


def display_rows(data,num=2):
    '''
    Return the first `num` rows of the data (default 2).
    Raises ValueError when data is None.
    it is used in the describe function
    '''
    if data is not None:
        return data.head(num)

    raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

def plot_missing(data=None):
    '''
    Draw the null mask of the data as a heatmap to show where values are missing.
    Parameters
    ----------
        data: DataFrame
            The data to plot.
    Raises
    ------
        ValueError
            If data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    null_mask = data.isnull()
    sns.heatmap(null_mask, cbar=True)
    plt.show()
    
def class_count(data=None, features=None, plot=False, save_fig=False):
    '''
    Displays the number of classes in a categorical feature.
    Parameters:

        data: Pandas DataFrame
            Dataset for plotting.
        features: Scalar, array, or list.
            The categorical features in the dataset, if None,
            we try to infer the categorical columns from the dataframe.
            A single column name (str) is treated as a one-element list.
        plot: bool, Default False.
            Plots the class counts of each displayed feature as a count plot.
        save_fig: bool, Default False.
            Saves each plot to the current working directory as
            '<feature>_class_count.png'.
    Raises:
        ValueError
            If data is None.
    it is used in the describe function
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if features is None:
        features = get_cat_feats(data)
    elif isinstance(features, str):
        # A bare string would otherwise be iterated character by character.
        features = [features]

    shown_features = []
    for feature in features:
        if data[feature].nunique() > 15:
            print("Unique classes in {} too large".format(feature))
        else:
            print('Class Count for', feature)
            display(pd.DataFrame(data[feature].value_counts()))
            shown_features.append(feature)

    if plot:
        # The original called an undefined `countplot` helper; draw the count
        # plots directly with seaborn, one figure per displayed feature.
        for feature in shown_features:
            fig, ax = plt.subplots()
            sns.countplot(x=feature, data=data, ax=ax)
            ax.set_title('Class count for {}'.format(feature))
            if save_fig:
                fig.savefig('{}_class_count.png'.format(feature))
            plt.show()
        
def get_date_cols(data=None):
    '''
    Returns the Datetime columns in a data set.
    Parameters
    ----------
        data: DataFrame
            Data set to infer datetime columns from.
    Returns:
    -------
        Set
         Names of the columns whose dtype is 'datetime64[ns, UTC]' together with
         the columns whose values look like dates (see _match_date).
    Raises:
    -------
        ValueError
            If data is None.
    used in the describe function to report the date columns
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    # Columns already stored as timezone-aware (UTC) pandas datetimes
    utc_mask = data.dtypes == 'datetime64[ns, UTC]'
    existing_dates = set(data.dtypes[utc_mask].index)

    # Add the columns inferred from their string representation
    return existing_dates.union(_match_date(data))

def bivariate_stats(data):
    '''Prints the contingency table and chi2 contingency test result between
        each pair of consecutive categorical columns in the dataframe.

        Pairs where either column has more than 15 unique values are skipped with
        a message. If the chi-square test cannot be computed for a pair, the
        reason is printed and the next pair is processed.

        Parameters
    ----------
        data: DataFrame
            Data set whose categorical (object) columns are compared.

        Returns None.

        it is used in the describe function for categorical features analysis
    '''
    # Use the columns of the frame that was passed in (the original read the
    # notebook-level `datainput` here, which may have different columns).
    cat_feats = get_cat_feats(data=data)

    for val1, val2 in zip(cat_feats, cat_feats[1:]):
        if (data[val1].nunique() > 15) or (data[val2].nunique() > 15):
            print('Number of unique values too large')
            continue

        freqtab = pd.crosstab(data[val1], data[val2])
        print("Frequency table")
        print("============================")
        print(freqtab)
        print("============================")
        try:
            chi2, pval, dof, expected = sp.chi2_contingency(freqtab)
        except ValueError as err:
            # Report the failure instead of silently aborting the whole loop.
            print("Chi-square test skipped for {} vs {}: {}".format(val1, val2, err))
            _space()
            continue
        print("ChiSquare test statistic: ",chi2)
        print("p-value: ",pval)
        _space()
    
def bivariate_stats_target(data, target):

    '''Prints the contingency table and chi2 contingency test result between each
        categorical column and the target variable in the dataframe


        Parameters
    ----------
        data: DataFrame
            Data set whose categorical (object) columns are compared to the target.
        target: str
            Name of the target column.

        Columns with more than 20 unique values are skipped with a message.
        Returns None.

        it is used in the describe function for categorical features analysis of the relationship between the target variable
        and other categorical features


    '''
    for feature in get_cat_feats(data=data):
        too_many_classes = data[feature].nunique() > 20
        if too_many_classes:
            print('Number of Unique values too large')
            continue

        freqtab = pd.crosstab(data[feature], data[target])
        print("Frequency table")
        print("============================")
        print(freqtab)
        print("============================")
        chi2, pval, dof, expected = sp.chi2_contingency(freqtab)
        print("ChiSquare test statistic: ",chi2)
        print("p-value: ",pval)
        _space()
            
def describe(data=None, name='', date_cols=None, show_categories=False, plot_missing=False, target = None):
    '''
    Calculates statistics and information about a data set. Information displayed are
    shapes, size, number of categorical/numeric/date features, missing values,
    dtypes of objects, correlation analysis, contigency analysis etc.
    Parameters:
    --------------------
        data: Pandas DataFrame
            The data to describe.
        name: str, optional
            The name of the data set passed to the function.
        date_cols: list/series/array
            Date column names in the data set. When None they are inferred
            with get_date_cols.
        show_categories: bool, default False
            Displays the unique classes and counts in each of the categorical feature in the data set.
        plot_missing: bool, default False
            Plots missing values as a heatmap in addition to the missing-value table.
        target: the target variable in the dataframe
    Returns:
    -------
        None
    Raises:
    -------
        ValueError
            If data is None.

        This function is stand alone use for quick statistical exploration of the data.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    ## Get categorical features
    cat_features = get_cat_feats(data)

    #Get numerical features
    num_features = get_num_feats(data)

    print('First five data points')
    display(data.head())
    _space()

    print('Random five data points')
    # sample() raises when asked for more rows than exist, so cap at the frame length.
    display(data.sample(min(5, len(data))))
    _space()

    print('Last five data points')
    display(data.tail())
    _space()

    print('Shape of {} data set: {}'.format(name, data.shape))
    _space()

    print('Size of {} data set: {}'.format(name, data.size))
    _space()

    print('Data Types')
    print("Note: All Non-numerical features are identified as objects in pandas")
    display(pd.DataFrame(data.dtypes, columns=['Data Type']))
    _space()

    # Honour caller-supplied date columns; infer them only when none were given.
    if date_cols is None:
        date_cols = get_date_cols(data)
    if len(date_cols) != 0:
        print("Column(s) {} should be in Datetime format. Use the [to_date] function to convert to Pandas Datetime format".format(date_cols))
        _space()

    print('Numerical Features in Data set')
    print(num_features)
    _space()

    print('Categorical Features in Data set')
    display(cat_features)
    _space()

    print('Statistical Description of Columns')
    display(data.describe())
    _space()

    print('Description of Categorical Features')
    # np.object no longer exists in current NumPy; select by dtype name instead and
    # skip the table when there is nothing categorical (describe would raise).
    categorical_frame = data.select_dtypes(include=['object', 'category'])
    if categorical_frame.shape[1] > 0:
        display(categorical_frame.describe().T)
        _space()

    print('Unique class Count of Categorical features')
    display(get_unique_counts(data))
    _space()

    if show_categories:
        print('Classes in Categorical Columns')
        print("-"*30)
        class_count(data, cat_features)
        _space()

    print('Missing Values in Data')
    # The `plot_missing` argument shadows the module-level plot_missing function
    # inside this body, so the heatmap is requested through display_missing.
    display(display_missing(data, plot=bool(plot_missing)))
    _space()

    # Correlations are only defined for numeric data; recent pandas raises when
    # corr() meets non-numeric columns, so restrict the frame explicitly.
    numeric_frame = data.select_dtypes(include=['number', 'bool'])

    print('Pearson Correlation')
    print(numeric_frame.corr())
    _space()

    print('Kendall Correlation')
    print(numeric_frame.corr(method='kendall'))
    _space()

    print('Spearman Correlation')
    print(numeric_frame.corr(method='spearman'))
    _space()

    print('Bivariant Stats between categorical features')
    # The helper prints its own report and returns None, so it is not wrapped in print().
    bivariate_stats(data)
    _space()
    if target is not None:
        print('Bivariant Stats between cat feats and target variable')
        bivariate_stats_target(data, target)
        _space()
    print('')
    

def drop_missing(data=None, percent=99):
    '''
    Drops the columns whose share of missing values is at least [percent].
    Parameters:
    -------------------------
        data: Pandas DataFrame.
        percent: float, Default 99
            Percentage of missing values a column must reach (>=) to be removed.
    Returns:
    ------------------
        Pandas DataFrame without the dropped columns; the input is not modified.
    Raises:
    ------------------
        ValueError
            If data is None.

    The names of the dropped columns are printed.
    It can be used alone. It is also used in the deal_with_missing_value function.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    missing_share = (data.isna().sum() / data.shape[0]) * 100
    too_sparse = missing_share.index[missing_share.values >= percent]
    print("Dropped {}".format(list(too_sparse)))
    # Remove the sparse columns
    return data.drop(too_sparse, axis=1)

def fill_missing_cats(data=None, cat_features=None, missing_encoding=None, missing_col=False):
    '''
    Fill missing values using the mode of the categorical features.
    Parameters:
    ------------------------
        data: DataFrame.
            Data set to perform operation on. It is not modified.
        cat_features: List, Series, Array.
            categorical features to perform operation on. If not provided, we automatically infer the categoricals from the dataset.
        missing_encoding: List, Series, Array.
            Values treated as missing and converted to NaN in every column
            before filling. Defaults to ['', ' ', -99, -999].
        missing_col: bool, Default False
            Creates a new '<feature>_missing_value' column per feature: 1 if the
            value was missing and 0 otherwise. This can sometimes help a machine learning model.
    Returns:
    ------------------------
        DataFrame with the categorical features filled. A feature with no
        observed value at all (no mode) is left unfilled.
    Raises:
    ------------------------
        ValueError
            If data is None.

      This function is used in the deal_with_missing_value function.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if cat_features is None:
        cat_features = get_cat_feats(data)

    if missing_encoding is None:
        missing_encoding = ['', ' ', -99, -999]

    # Change all possible missing-value encodings to NaN. replace() returns a new
    # frame, so the caller's data stays untouched. (np.nan: the np.NaN alias no
    # longer exists in NumPy 2.)
    df = data.replace(missing_encoding, np.nan)

    for feat in cat_features:
        if missing_col:
            df[feat + '_missing_value'] = (df[feat].isna()).astype('int64')
        modes = df[feat].mode()
        if modes.empty:
            # Entirely missing column: there is no most-frequent value to fill with.
            continue
        df[feat] = df[feat].fillna(modes.iloc[0])

    return df

def fill_missing_num(data=None, num_features=None, method='mean', missing_col=False):
    '''
    fill missing values in numerical columns with specified [method] value
    Parameters:
        ------------------------------
        data: DataFrame.
            The data set to fill. It is not modified.
        num_features: list.
            List of columns to fill. If not provided, the numerical columns
            that contain missing values are inferred from the data set.
        method: str, Default 'mean'.
            method to use in calculating fill value: 'mean', 'median' or 'mode'.
        missing_col: bool, Default False
          Creates a new '<feature>_missing_value' column per filled feature: 1 if
          missing and 0 otherwise. This can sometimes help a machine learning model.
    Returns:
        ------------------------------
        DataFrame with the selected numerical columns filled.
    Raises:
        ------------------------------
        ValueError
            If data is None or method is not one of the supported names.

          This function is used in the deal_with_missing_value function.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    # Compare by value (not identity) and reject a bad method up front.
    if method not in ('mean', 'median', 'mode'):
        raise ValueError("method: must specify a fill method, one of [mean, mode or median]'")

    if num_features is None:
        num_features = get_num_feats(data)
        #get numerical features with missing values
        missing_counts = data[num_features].isna().sum()
        features = list(missing_counts[missing_counts > 0].index)
    elif isinstance(num_features, str):
        features = [num_features]
    else:
        # Caller-supplied columns are used as given (previously this path
        # failed because `features` was never assigned).
        features = list(num_features)

    df = data.copy()
    for feat in features:
        if missing_col:
            df[feat + '_missing_value'] = (df[feat].isna()).astype('int64')

        if method == 'mean':
            fill_value = df[feat].mean()
        elif method == 'median':
            fill_value = df[feat].median()
        else:
            modes = df[feat].mode()
            if modes.empty:
                # Entirely missing column: no mode to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back: an in-place fillna on df[feat] does not reach
        # the frame when pandas Copy-on-Write is active.
        df[feat] = df[feat].fillna(fill_value)
    return df

def deal_with_missing_value(data, percent=70):
    """
    Take care of missing values in three steps:
        1. drop the columns whose share of missing values reaches `percent`
           (70 by default) with drop_missing,
        2. fill the categorical columns with the mode of each column
           (fill_missing_cats),
        3. fill the numerical columns with the mean of each column
           (fill_missing_num).

        Raises ValueError when data is None.

        This function is used in the feature_preprocessing function to deal with missing values. It can also be used alone.
        """
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    without_sparse_cols = drop_missing(data=data, percent=percent)
    cats_filled = fill_missing_cats(data=without_sparse_cols)
    return fill_missing_num(data=cats_filled)

def drop_redundant(data):
    '''
    Removes features with the same value in all cell. Drops feature If Nan is the second unique class as well.
    Parameters:
    -----------------------------
        data: DataFrame or named series.
    
    Returns:
        DataFrame or named series.
    This function is used in the feature_processing function.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")
    
    #get columns
    cols_2_drop = _nan_in_class(data)
    print("Dropped {}".format(cols_2_drop))
    df = data.drop(cols_2_drop, axis=1)
    return df
def _nan_in_class(data):
    """helper function for drop_redundant function"""
    cols = []
    for col in data.columns:
        if len(data[col].unique()) == 1:
            cols.append(col)

        if len(data[col].unique()) == 2:
            if np.nan in list(data[col].unique()):
                cols.append(col)

    return cols

#Categorical encoders for object-to-numeric conversion
def binaryencoder(data):
    """To avoid the curse of dimensionality, this function only binary-encodes categorical features with less than 7 unique values

        This function can be used alone. It is also used in the encode_data function,
        where it is the default method.

        explanation: It is similar to onehot encoding but gives lesser dimensions, making it a better option.
                    It converts the unique entry into binary combination and then creates column using binary hashing.

        Returns the frame produced by the fitted BinaryEncoder's transform.
    """
    low_cardinality_cols = [
        col for col in get_cat_feats(data=data) if data[col].nunique() < 7
    ]

    encoder = BinaryEncoder(cols=low_cardinality_cols).fit(data)
    return encoder.transform(data)

def onehotencoder(data):
    """To avoid the curse of dimensionality, this function only one-hot encodes categorical features with less than 4 unique values

        This function can be used alone. It is also used in the encode_data function
        when its method parameter is set to 'onehot'.

    explanation: it is used mostly for nominal variables such that a binary combination of the unique values are set as new columns
                    in the dataset.

    Returns the frame produced by the fitted OneHotEncoder's transform.
    """
    low_cardinality_cols = [
        col for col in get_cat_feats(data=data) if data[col].nunique() < 4
    ]

    encoder = OneHotEncoder(cols=low_cardinality_cols).fit(data)
    return encoder.transform(data)

def labelencoder(data):
    """
    This function can be used alone. It is also used in the encode_data function.

    Every categorical (object) column is cast to str and replaced by integer
    codes from the shared LabelEncoder `le`, which is re-fitted per column.

    Explanation: Label encoders are ordinal encoders that encode unique values as continuous intergers.

    Returns a new DataFrame; the frame passed in is left unchanged.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame) is
    # neither mutated nor the source of a SettingWithCopyWarning.
    encoded = data.copy()
    for feat in get_cat_feats(data=encoded):
        encoded[feat] = le.fit_transform(encoded[feat].astype(str))
    return encoded

def sumencoder(data):
    """
    This function can be used alone. It is also used in the encode_data function.

    Only categorical features with less than 4 unique values are sum-encoded,
    to avoid the curse of dimensionality.

    explanation: it is similar to one-hot encoding but the difference is that in sum encoding one value is taken as '-1'
                and it is not compared to other value.

    Returns the frame produced by SumEncoder.fit_transform.
    """
    low_cardinality_cols = [
        col for col in get_cat_feats(data=data) if data[col].nunique() < 4
    ]
    return SumEncoder(cols = low_cardinality_cols).fit_transform(data)

def catboostencoder(data, target):
    '''Encode the categorical features with a CatBoostEncoder fitted against the target.

        This function is used alone. It is not called by any other function.

        Parameters:
            data: DataFrame containing the features and the target column.
            target: name of the target column; it is removed from the features
                before fitting.

        explanation: a target encoder. It uses the target variable in encoding the categorical variables.

        Returns the encoded feature frame (without the target column).
    '''
    feature_frame = data.drop(target, axis=1)
    labels = data[target]
    cat_cols = get_cat_feats(data=feature_frame)

    encoder = CatBoostEncoder(cols=cat_cols).fit(feature_frame, labels)
    return encoder.transform(feature_frame, labels)

def hashencoder(data):
    '''
         This function can be used alone.

        Hash-encodes the categorical columns selected by a digit heuristic: the
        first and last value of a column are joined as text, and the column is
        selected when that text contains more than two runs of digits and the
        first run is longer than two digits.

        explanation: Feature hashing maps each category in a categorical feature to an integer within a predetermined range

                        The size of the output dimensions is controlled by the variable n_components
                        (fixed to 1 here).

        Returns the frame produced by the fitted HashingEncoder's transform.
    '''

    cols = get_cat_feats(data)
    new_col = []
    # An empty frame has no first/last row to inspect.
    if len(data) > 0:
        for col in cols:
            # Positional access: label-based data[col][0] fails whenever the index
            # does not contain the labels 0 and len(data)-1 (e.g. after filtering).
            first_and_last = str(data[col].iloc[0]) + str(data[col].iloc[-1])
            digit_runs = re.findall(r'\d+', first_and_last)
            if len(digit_runs) > 2 and len(digit_runs[0]) > 2:
                new_col.append(col)

    enc = HashingEncoder(cols=new_col, n_components= 1).fit(data)
    data = enc.transform(data)

    return data

def embeddingencoder(data):
    """
    This function is used alone.
    It returns the label-encoded data produced by `ce.get_label_encoded_data`.

    NOTE(review): relies on a module-level name `ce`; the matching import at the
    top of the notebook (`#import categorical_embedder`) is commented out, so a
    binding for `ce` must be restored before this function can run.
    """
    # Kept from the original; the result is not used further in this function.
    embedding_info = ce.get_embedding_info(data)
    X_encoded, encoders = ce.get_label_encoded_data(data)

    # The original returned the misspelled name `X_endoded` (NameError).
    return X_encoded

def encode_data(data, method='binary'):
    """
        encodes categorical variables automatically: low-cardinality columns are
        first encoded with the chosen method, then all remaining categorical
        columns are label encoded.

        method: one of 'binary' (default), 'onehot', 'sumencode' or
                'hashencode' ('hashcode' is accepted as an alias). Any other
                value skips the first step, so only label encoding is applied.

        Returns the encoded DataFrame.

        this function can be used alone and it also used in the feature_processing function.
    """
    if method == 'binary':
        data = binaryencoder(data)
    elif method == 'onehot':
        data = onehotencoder(data)
    elif method == 'sumencode':
        # The original called the undefined name `sumencode` (NameError).
        data = sumencoder(data)
    elif method in ('hashencode', 'hashcode'):
        # Documented option that previously was never dispatched.
        data = hashencoder(data)
    data = labelencoder(data)
    return data

In [None]:
data = deal_with_missing_value(data, percent=60)

In [None]:
#data = drop_redundant(data)

In [None]:
describe(data=data, name='', date_cols=None, show_categories=False, plot_missing=False, target = None)

In [None]:
data['frequency_of_content_category'] = 1

In [None]:
grouped1 = data.groupby(by = ['product_id','monthyear', 'content_category'])['frequency_of_content_category'].sum().reset_index()

# No 'content_category', 'content_message_local', 'content_message_global', 'product', 'indication', 'therapeutic_area', 'segment_quant', 'touchpoint_channel_clm','account_id'

In [None]:
grouped2 = data.groupby(by = ['product_id','monthyear', 'content_message_local',
       'content_message_global', 'product', 'indication', 'therapeutic_area', 'segment_quant', 'touchpoint_channel_clm',
       'tactic', 'account_id', 'content_category'])['frequency_of_content_category'].sum().reset_index()

In [None]:
grouped3 = data.groupby(by = ['product_id','monthyear', 'tactic','content_message_local',
       'content_message_global', 'product', 'indication', 'therapeutic_area', 'segment_quant', 'touchpoint_channel_clm',
       'content_category'])['frequency_of_content_category'].sum().reset_index()

# No 'account_id'

In [None]:
#grouped = data.groupby(by = ['product_id','content_category', 'content_message_local',
#       'content_message_global', 'product', 'indication', 'therapeutic_area', 'segment_quant', 'touchpoint_channel_clm',
#       'content_message_concat', 'monthyear'])['frequency_of_tactic'].sum().reset_index()

In [None]:
grouped1['product_id'].value_counts()

In [None]:
grouped2['product_id'].value_counts()

In [None]:
grouped3['product_id'].value_counts()

In [None]:
grouped_data=grouped3

In [None]:
values = dict(grouped_data['content_category'].value_counts())

In [None]:
values

In [None]:
list_of_diff_content_categories= values.keys()

In [None]:
# Number the content categories 1, 2, 3, ... in the order they appear in the list.
mapper = {
    category: code
    for code, category in enumerate(list_of_diff_content_categories, start=1)
}

In [None]:
 pd.DataFrame(mapper.items(), columns=['content_category', 'encoded_content_category'])

In [None]:
grouped_data['encoded_content_category'] = grouped_data['content_category'].map(mapper)

In [None]:
grouped_data

In [None]:
# plot_missing is a boolean switch, not a column name.
describe(data=grouped_data, name='', date_cols=None, show_categories=False, plot_missing=False, target = None)

In [None]:
grouped_data['month'] = grouped_data['monthyear'].dt.month

In [None]:
grouped_data['year'] = grouped_data['monthyear'].dt.year

In [None]:
new_data1 = grouped_data[grouped_data['product']=='Skyrizi_PS']

In [None]:
new_data2 = grouped_data[grouped_data['product']=='Skyrizi_KAM']

In [None]:
new_data1 = new_data1.drop(['product_id', 'monthyear'], axis=1)

In [None]:
new_data2 = new_data2.drop(['product_id', 'monthyear'], axis=1)

In [None]:
new_data1 = encode_data(new_data1, method='binary')

In [None]:
new_data2 = encode_data(new_data2, method='binary')

In [None]:
new_data1 = new_data1.drop('tactic', axis=1)

In [None]:
#new_data2 = new_data2.drop('tactic', axis=1)

In [None]:
print('Skyrizi_PS')
sns.heatmap(new_data1.corr(), annot=True)

In [None]:
print('Skyrizi_KAM')
sns.heatmap(new_data2.corr(), annot=True)

In [None]:
print('Skyrizi_PS')
new_data1.columns

In [None]:
print('Skyrizi_KAM')
new_data2.columns

In [None]:
def standardscaler(data, cols = None):
    '''
    It is a standard normalization technique use when using machine learning models that assusme a normal/gaussian distribution.
    Models like Linear Regression, Gaussian Naive Bayes etc.
    -------------
        data: DataFrame
            Data set to perform operation on. It advisable not to scale/normalise the target variable.
        cols: str or list of str
            columns to scale/normalise. If not parsed, it scales/normalises the entire dataframe
        Returns:
    --------
        DataFrame of the scaled/normalised data/columns. The result keeps the
        index and column names of `data`; the input frame is not modified.

    It is used in the scale_normalise_data. It can also be used alone.
     '''
    s_scaler = StandardScaler()

    if cols is not None:
        # A single column name must still be passed to the scaler as a 2-D selection.
        if isinstance(cols, str):
            cols = [cols]
        scaled = data.copy()
        scaled[cols] = s_scaler.fit_transform(scaled[cols])
        return scaled

    # Keep the original index so the result still aligns row-for-row with other
    # objects derived from `data` (e.g. a target Series).
    return pd.DataFrame(s_scaler.fit_transform(data), columns=data.columns, index=data.index)

In [None]:
import statsmodels.api as sm
# Keep the target one-dimensional: reset_index() without drop=True turned it into
# a two-column frame ('index' + target). Both sides get a fresh 0..n-1 index so
# that they align row-for-row.
y = new_data1['frequency_of_content_category'].reset_index(drop=True)
X = standardscaler(new_data1.drop('frequency_of_content_category', axis=1)).reset_index(drop=True)

# NOTE(review): no intercept column is added, so the regression is forced through the origin.
model = sm.OLS(y, X)
results = model.fit()
results.params

## Analysis of Content Message Concat

In [None]:
grouped_data['product_id'].value_counts()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime



# Work on an independent copy so adding the helper column does not write into
# a slice of grouped_data.
df1 = grouped_data[['encoded_content_category','frequency_of_content_category','monthyear']].copy()
df1['month_number'] = df1['monthyear'].dt.month

# Aggregate only the frequency column; summing every remaining column would
# also try to sum the Period column.
pt = pd.pivot_table(df1, values='frequency_of_content_category', index='month_number',
                    columns='encoded_content_category', aggfunc='sum')

ax = plt.figure().add_subplot(111)
ax.plot(pt)

ticklabels = [datetime.date(1900, item, 1).strftime('%b') for item in pt.index]
# Tick only the months that are present so ticks and labels always have the same length.
ax.set_xticks(list(pt.index))
ax.set_xticklabels(ticklabels) #add monthlabels to the xaxis

ax.legend(pt.columns.tolist(), loc='center left', bbox_to_anchor=(1, .5)) #add the column names as legend.
plt.tight_layout(rect=[0, 0, 0.85, 1])

plt.show()

In [None]:
# Map every entry of list_of_diff_content_categories to a numbered label
# ('content_category_1', 'content_category_2', ...), numbering from 1 in list order.
mapper = {
    category: f'content_category_{position}'
    for position, category in enumerate(list_of_diff_content_categories, start=1)
}

In [None]:
# Add the numbered label for each row's content_category by looking it up in mapper.
# Series.map leaves NaN for any content_category value that is not a key of mapper.
grouped_data['encoded_content_category'] = grouped_data['content_category'].map(mapper)

In [None]:
# Inspect the encoded labels that were just assigned.
grouped_data['encoded_content_category']

In [None]:
# Add a 'Date' column by casting the 'monthyear' values to datetime64 with month resolution;
# it is used as the x-axis of the line plots further down.
# NOTE(review): presumably 'monthyear' holds monthly periods — confirm this cast is supported
# by the installed pandas version.
grouped_data['Date'] = grouped_data['monthyear'].values.astype('datetime64[M]')

In [None]:
# Number of rows in grouped_data for each product_id (the products analysed one by one below).
grouped_data['product_id'].value_counts()

## Content Message Concat Analysis for Product a00G000000URPpNIAX

In [None]:
# Rows of grouped_data that belong to product a00G000000URPpNIAX.
df1 = grouped_data.loc[grouped_data['product_id'].eq('a00G000000URPpNIAX')]

In [None]:
# Encoded categories present for this product, most frequent first.
hue_order = df1['encoded_content_category'].value_counts().index.tolist()

In [None]:
# Row count per encoded content category for this product.
df1['encoded_content_category'].value_counts()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for product a00G000000URPpNIAX. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df1,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Product a00G000000URPpNIAX: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df1
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

## Content Message Concat Analysis for Product a00G000000URPpGIAX

In [None]:
# Rows of grouped_data that belong to product a00G000000URPpGIAX.
df2 = grouped_data.loc[grouped_data['product_id'].eq('a00G000000URPpGIAX')]

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df2['encoded_content_category'].nunique()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for product a00G000000URPpGIAX. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df2,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Product a00G000000URPpGIAX: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df2
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

## Content Message Concat Analysis for Product a00G000000URPpMIAX

In [None]:
# Rows of grouped_data that belong to product a00G000000URPpMIAX.
df3 = grouped_data.loc[grouped_data['product_id'].eq('a00G000000URPpMIAX')]

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df3['encoded_content_category'].nunique()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for product a00G000000URPpMIAX. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df3,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Product a00G000000URPpMIAX: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df3
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

## Content Message Concat Analysis for Product a00G000000URPppIAH

In [None]:
# Rows of grouped_data that belong to product a00G000000URPppIAH.
df4 = grouped_data.loc[grouped_data['product_id'].eq('a00G000000URPppIAH')]

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df4['encoded_content_category'].nunique()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for product a00G000000URPppIAH. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df4,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Product a00G000000URPppIAH: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df4
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

##  Content Message Concat Analysis for Product a00G000000URPpkIAH

In [None]:
# Rows of grouped_data that belong to product a00G000000URPpkIAH.
df5 = grouped_data.loc[grouped_data['product_id'].eq('a00G000000URPpkIAH')]

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df5['encoded_content_category'].nunique()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for product a00G000000URPpkIAH. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df5,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Product a00G000000URPpkIAH: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df5
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

##  Content Message Concat Analysis for Product a001v00001wbtlnAAA

In [None]:
# Rows of grouped_data that belong to product a001v00001wbtlnAAA.
df6 = grouped_data.loc[grouped_data['product_id'].eq('a001v00001wbtlnAAA')]

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df6['encoded_content_category'].nunique()

In [None]:
# Row count per encoded content category for this product, shown as a plain dict.
dict(df6['encoded_content_category'].value_counts())

In [None]:
# First plotting group for df6: content_category_1 through content_category_5.
value_list = [f'content_category_{number}' for number in range(1, 6)]
df_1 = df6.loc[df6['encoded_content_category'].isin(value_list)]

In [None]:
# Second plotting group for df6: categories 6, 7, 10, 11 and 12.
# NOTE(review): content_category_8 and content_category_9 are in neither plotting group —
# confirm that they do not occur for this product.
value_list = [f'content_category_{number}' for number in (6, 7, 10, 11, 12)]
df_2 = df6.loc[df6['encoded_content_category'].isin(value_list)]

In [None]:
# The two category subsets, plotted one figure each in the next cell.
col = [df_1, df_2]

In [None]:
# One figure per subset in `col` (the category groups built from df6 above), each showing the
# summed frequency per 'Date' value with one line per encoded content category.
# The title carries the product and the part number so the figures can be told apart.
for part, subset in enumerate(col, start=1):
    fig, ax = plt.subplots()
    sns.lineplot(
        data=subset,
        x='Date',
        y='frequency_of_content_category',
        hue='encoded_content_category',
        estimator='sum',
        ci=None,
        ax=ax,
    )
    ax.set_title(
        f'Product a001v00001wbtlnAAA: content category frequency over time (part {part} of {len(col)})'
    )
    ax.set_xlabel('Month')
    ax.set_ylabel('Summed frequency_of_content_category')
    fig.autofmt_xdate()

In [None]:
# Table form of the plots above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df6
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

##  Content Message Concat Analysis for Product Skyrizi_KAM

In [None]:
# Rows of grouped_data whose 'product' column is Skyrizi_KAM.
df7 = grouped_data.loc[grouped_data['product'].eq('Skyrizi_KAM')]

In [None]:
# Inspect the Skyrizi_KAM subset.
df7

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df7['encoded_content_category'].nunique()

In [None]:
# Summed frequency per 'Date' value, one line per encoded content category,
# for Skyrizi_KAM. Title and axis labels make the figure self-describing.
fig, ax = plt.subplots()
sns.lineplot(
    data=df7,
    x='Date',
    y='frequency_of_content_category',
    hue='encoded_content_category',
    estimator='sum',
    ci=None,
    ax=ax,
)
ax.set_title('Skyrizi_KAM: content category frequency over time')
ax.set_xlabel('Month')
ax.set_ylabel('Summed frequency_of_content_category')
fig.autofmt_xdate()

In [None]:
# Table form of the plot above: one row per monthyear, one column per encoded category,
# each cell the summed frequency.
(
    df7
    .groupby(['monthyear', 'encoded_content_category'])['frequency_of_content_category']
    .sum()
    .unstack('encoded_content_category')
)

##  Content Message Concat Analysis for Product Skyrizi_PS

In [None]:
# Rows of grouped_data whose 'product' column is Skyrizi_PS.
df8 = grouped_data.loc[grouped_data['product'].eq('Skyrizi_PS')]

In [None]:
# Inspect the Skyrizi_PS subset.
df8

In [None]:
# How many distinct encoded content categories this product has (NaN not counted).
df8['encoded_content_category'].nunique()

In [None]:
# Row count per encoded content category for this product, shown as a plain dict.
dict(df8['encoded_content_category'].value_counts())

In [None]:
# First plotting group for df8: content_category_1 through content_category_5.
value_list = [f'content_category_{number}' for number in range(1, 6)]
df_1 = df8.loc[df8['encoded_content_category'].isin(value_list)]

In [None]:
# Second plotting group for df8: categories 6, 7, 10, 11 and 12.
# NOTE(review): content_category_8 and content_category_9 are in neither plotting group —
# confirm against the value counts of df8 that no category of this product is left out.
value_list = [f'content_category_{number}' for number in (6, 7, 10, 11, 12)]
df_2 = df8.loc[df8['encoded_content_category'].isin(value_list)]

In [None]:
# The two category subsets, plotted one figure each in the next cell.
col = [df_1, df_2]

In [None]:
# One figure per subset in `col` (the category groups built from df8 above), each showing the
# summed frequency per 'Date' value with one line per encoded content category.
# The title carries the product and the part number so the figures can be told apart.
for part, subset in enumerate(col, start=1):
    fig, ax = plt.subplots()
    sns.lineplot(
        data=subset,
        x='Date',
        y='frequency_of_content_category',
        hue='encoded_content_category',
        estimator='sum',
        ci=None,
        ax=ax,
    )
    ax.set_title(
        f'Skyrizi_PS: content category frequency over time (part {part} of {len(col)})'
    )
    ax.set_xlabel('Month')
    ax.set_ylabel('Summed frequency_of_content_category')
    fig.autofmt_xdate()

### Tactic and Actual Message

In [None]:
# Lookup table: each original content_category value next to the numbered label
# that stands in for it in the plots and tables above.
pd.DataFrame(
    list(mapper.items()),
    columns=['content_category', 'encoded_content_category'],
)