# Cherry Picking the Data
## Anton Lipkanou
### For personal use only

In [None]:
# Part A. Draw a reproducible sample of two uncorrelated normal variables.
#
# n: value handed to set.seed(), so the same n reproduces the same sample
#    on any machine (note: this resets the global RNG state).
#
# Returns the 256-row, 2-column matrix produced by MASS::mvrnorm():
# column 1 has mean 5 and variance 5, column 2 has mean 10 and variance 1,
# and the covariance between them is 0. With empirical = TRUE these moments
# are matched by the sample itself rather than only in expectation.
generate <- function(n) {
  target_means <- c(5, 10)
  target_cov <- matrix(c(5, 0, 0, 1), ncol = 2, byrow = TRUE)
  set.seed(n)
  MASS::mvrnorm(256, mu = target_means, Sigma = target_cov, empirical = TRUE)
}

# Part B. Compare the full-sample correlation with the best "cherry-picked"
# correlation obtained by keeping only the first n rows.
#
# arr:   two-dimensional numeric object (matrix or data frame); columns 1 and
#        2 hold the two variables.
# min_n: smallest prefix length tried (default 60).
#
# Returns c(total, max): `total` is cor() over all rows; `max` is the largest
# cor() over the row prefixes 1:n for n = min_n, ..., nrow(arr).
#
# The maximum is taken over the computed correlations themselves. Starting a
# running maximum at 0 would report 0 whenever every prefix correlation is
# negative. The upper bound follows nrow(arr), so inputs with other than 256
# rows are handled as well.
compare <- function(arr, min_n = 60) {
  n_rows <- nrow(arr)
  stopifnot(length(dim(arr)) == 2, min_n >= 2, n_rows >= min_n)

  total_corr <- cor(arr[, 1], arr[, 2])
  prefix_corr <- vapply(
    seq(min_n, n_rows),
    function(n) cor(arr[1:n, 1], arr[1:n, 2]),
    numeric(1)
  )

  c(total_corr, max(prefix_corr))
}

# Part C. Repeat Part B over 1,000 independently seeded samples.
#
# Seeds 123, 124, ..., 1122 are used in order; each seed is passed to
# generate() and the resulting sample to compare().
#
# Returns a 1000 x 2 numeric matrix: column 1 holds the total-sample
# correlations, column 2 the maximum correlations.
multiply <- function() {
  seeds <- 122 + 1:1000
  # vapply yields one column per seed (2 x 1000); transpose so that each
  # sample becomes a row.
  corr_by_seed <- vapply(
    seeds,
    function(seed) compare(generate(seed)),
    numeric(2)
  )
  t(corr_by_seed)
}

# Part C1. Graph the correlations.
#
# mat: two-column numeric matrix; column 1 = total correlations,
#      column 2 = maximum correlations (the layout returned by multiply()).
#
# Draws one histogram per column on the active graphics device. The value of
# the last hist() call (a "histogram" object) is returned invisibly.
#
# The histograms are drawn from the `mat` argument; reading a global `data`
# object instead would ignore whatever the caller passed in.
graph <- function(mat) {
  hist(mat[, 1], main = "Total Correlation", xlab = "Total Correlation")
  hist(mat[, 2], main = "Maximum Correlation", xlab = "Maximum Correlation")
}

# Final part. Find the prefix of the max-correlation column that gives the
# smallest p-value in a one-sided, one-sample t-test.
#
# mat: two-dimensional numeric object with at least 2 rows; column 2 holds
#      the maximum correlations.
# mu:  hypothesised mean for the t-test (default 0.847); the alternative is
#      "true mean is greater than mu".
#
# For every i = 2, ..., nrow(mat) the test is run on rows 1:i of column 2.
# Returns c(minP, maxI): the smallest p-value found and the i at which it
# first occurred. If no p-value is below 1, the result is c(1, 0).
#
# The data come from the `mat` argument and the loop bound from nrow(mat),
# so the function does not depend on a global `data` object having exactly
# 1000 rows.
final <- function(mat, mu = 0.847) {
  n_rows <- nrow(mat)
  stopifnot(length(dim(mat)) == 2, n_rows >= 2)

  minP <- 1
  maxI <- 0
  for (i in seq(2, n_rows)) {
    p_value <- t.test(mat[1:i, 2], mu = mu, alternative = "greater")$p.value
    if (p_value < minP) {
      minP <- p_value
      maxI <- i
    }
  }
  c(minP, maxI)
}

In [None]:
# Main script: simulate the correlations, plot them, and test the maximum
# correlations against the 0.847 threshold.
data <- multiply() # column 1 = total correlation, column 2 = max correlation
graph(data) # one histogram per column
# One-sample t-test on the max-correlation column. The original call was to
# `ttest`, which is neither defined in this file nor a base R function, so
# stats::t.test is used with the same hypothesis that final() tests.
# NOTE(review): confirm this is the intended test.
t.test(data[, 2], mu = 0.847, alternative = "greater")

In [None]:
# Two-sided one-sample t-tests (H0: mean = 0) for each column of `data`.
t.test(data[, 1])
t.test(data[, 2])
# One-sided test of whether the mean maximum correlation exceeds 0.847.
# mu and alternative are set explicitly so the call matches that stated
# purpose and the test performed inside final(); without them t.test()
# checks mean != 0 instead.
tTestValuesMAxCorr <- t.test(data[, 2], mu = 0.847, alternative = "greater")
print("P-value, I ")
# Smallest p-value over the row prefixes, and the prefix length achieving it.
final(data)