In [None]:
from scipy.stats import norm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.integrate import quad
import scipy
import random
from scipy.optimize import curve_fit

In [None]:


def area_diff2(distri_calc, values_cumsum):
  '''
  Chi-square-style distance between expected and observed values.

  distri_calc   -- expected / fitted values (E); its length decides how many
                   positions are compared
  values_cumsum -- observed values (O), read at the same positions

  Every position contributes (O - E)**2 / (O + 1). Note the denominator is
  O + 1, not O, so a position with O == 0 does not divide by zero.

  Returns the sum of all contributions rounded to 4 decimal places.
  '''

  # Index both sequences explicitly: a shorter values_cumsum must still raise
  # IndexError, and extra trailing observations are ignored.
  per_position = [
      ((values_cumsum[pos] - distri_calc[pos]) ** 2) / (values_cumsum[pos] + 1)
      for pos in range(len(distri_calc))
  ]

  total = sum(per_position)
  return np.round(total, 4)

In [None]:
def model(hype_word , pmid_bef_percentile):
  '''
  Grid-search a three-segment fit to the histogram of values stored for one
  hype word.

  hype_word           -- key into pmid_bef_percentile (converted with str()).
  pmid_bef_percentile -- mapping from word to an iterable of 2-item pairs,
                         e.g. pmid_bef_percentile['novel'][:4]. Only the
                         second item of every pair is used; presumably it is
                         the percentile position of the word inside an
                         abstract -- confirm with the caller.

  The second items are binned into 30 equal-width bins. The first and last
  bin are left out of the fit; the remaining 28 are split into

    bins 1 .. qq-1   : i * sigmoid(a*x + b)     + c3   ("firstarea")
    bins qq .. ww-1  : c3                              ("noise", constant)
    bins ww .. 28    : j * sigmoid(c*(x-1) + d) + c3   ("secondarea")

  qq and ww are the bin indices where the first sigmoid ends and the second
  one starts (qq in 13..19, ww in qq+1..21). i and j are the weights of the
  two sigmoids, taken from a 50-point grid on [0.05, 0.9]; c3 = 1 - i - j is
  the weight of the constant part. For every (qq, ww, i, j) the sigmoid
  parameters are obtained with scipy's curve_fit on the normalised histogram,
  the candidate is rescaled to counts and scored with area_diff2, and the
  candidate with the smallest score is kept (first one wins on ties).

  Candidates are skipped when i + j > 1, when the constant level is below
  half of the minimum of either fitted sigmoid segment, or when curve_fit
  does not converge.

  Returns the tuple
    (hype_word, values_, parameters, newqq, newww, newY, minv,
     firstarr, secondarr, noisearr)
  where values_ are the 30 bin counts, parameters is [i, j, popti1, popti2],
  newY is the best fitted curve in counts, minv its score, and firstarr /
  secondarr / noisearr are the two sigmoid parts (constant level removed) and
  the constant part, all scaled to counts. If no candidate is accepted,
  parameters, newY and the three arrays are empty lists, newqq and newww are
  0 and minv is inf.

  Raises ValueError if the word has no finite values to bin.
  '''

  N_BINS = 30

  hype_word = str(hype_word)
  # Nothing below draws from `random`; the seeding is kept only because it is
  # an observable side effect of the original function.
  random.seed(42)

  # Histogram with plain numpy instead of drawing (and closing) a seaborn
  # figure: same 30 equal-width bins over [min, max]; non-finite samples are
  # dropped, as the plotting route did.
  samples = np.asarray([y for _x, y in pmid_bef_percentile[hype_word]], dtype=float)
  samples = samples[np.isfinite(samples)]
  if samples.size == 0:
    raise ValueError('no finite values to fit for hype word ' + repr(hype_word))

  counts, edges = np.histogram(samples, bins=N_BINS)
  values_ = [np.round(x, 3) for x in counts.astype(float)]
  bins_cum = np.cumsum(np.diff(edges))

  # Bin centres, measured from the left edge of the first bin.
  x_bins = bins_cum - (bins_cum[0]/2)

  # Normalised histogram; empty bins get a small positive floor.
  total_count = sum(values_)
  p = [(x/total_count) for x in values_]
  p = [0.0001 if x==0 else x for x in p]

  # Loop invariants: observed counts without the two edge bins.
  observed_inner = values_[1:-1]
  inner_total = sum(observed_inner)
  n_inner = len(observed_inner)

  # Candidate weights for the two sigmoids (rounded once, not per iteration).
  c1 = np.round(np.linspace(0.05, 0.9, 50), 3)
  c2 = np.round(np.linspace(0.05, 0.9, 50), 3)

  minv = float('inf')
  parameters = []
  newY = []
  newqq = 0
  newww = 0
  # Defined up front so the return below cannot hit an unbound name when no
  # candidate is ever accepted.
  firstarr = []
  secondarr = []
  noisearr = []

  for qq in range(13, 20):
    for ww in range(qq+1, 22):
      for i in c1:
        for j in c2:

          if (i+j>1):
            continue
          c3 = 1-i-j
          # i + j <= 1 can still leave a tiny negative remainder in floats.
          if c3<0:
            continue

          def function_sig1(x, a,b):
            firstarea = i*(1 / (1 + np.exp(-a*x -b)))
            return firstarea + c3

          def function_sig2(x, c, d):
            secondarea = j*(1 / (1 + np.exp(-c*(x-1) - d)))
            return secondarea + c3

          p1 = [0.5, 0]
          p2 = [0.5, 0]

          try:
            popti1, _ = curve_fit(function_sig1, x_bins[1:qq], p[1:qq], p1, maxfev = 5000)
            popti2, _ = curve_fit(function_sig2, x_bins[ww:-1], p[ww:-1], p2, maxfev = 5000)
          except RuntimeError:
            # curve_fit raises RuntimeError when the least-squares search does
            # not converge within maxfev; drop this candidate instead of
            # aborting the whole grid search.
            continue

          first_a = function_sig1(x_bins[1:qq], *popti1)
          second_a = function_sig2(x_bins[ww:-1], *popti2)
          noise = c3*np.ones(ww-qq)

          # Reject fits whose constant level is below half of the lowest
          # point of either sigmoid segment.
          if (2*noise[0]< min(first_a)) or (2*noise[0]< min(second_a)):
            continue

          # Stitch the three segments together and rescale to counts.
          Y = np.concatenate((first_a, noise, second_a))
          Y = np.multiply(Y, inner_total)
          Y = np.round(Y,2)
          diffval = np.round(area_diff2(Y, observed_inner), 2)

          if any(ll<0 for ll in Y):
            print('Error')

          if diffval<minv:
            # Best candidate so far (strict '<': the first one wins on ties).
            firstarr = np.multiply((first_a - noise[0]), inner_total)
            secondarr = np.multiply((second_a - noise[0]), inner_total)
            noisearr = np.multiply(np.full(n_inner, noise[0]), inner_total)

            newqq = qq
            newww = ww
            newY = Y
            minv = diffval
            parameters = [i, j, popti1, popti2]

  funcx = (hype_word, values_, parameters, newqq, newww, newY, minv, firstarr, secondarr, noisearr)

  return funcx
  

